From fc110202dffa06950716e0cc4535b07aaa2c439c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 08:00:25 -0700 Subject: [PATCH 01/72] [Support] Validate number of arguments passed to formatv() (#105745) Change formatv() to validate that the number of arguments passed matches number of replacement fields in the format string, and that the replacement indices do not contain holes. To support cases where this cannot be guaranteed, introduce a formatv() overload that allows disabling validation with a bool flag as its first argument. --- .../Checkers/StdLibraryFunctionsChecker.cpp | 5 +- llvm/benchmarks/CMakeLists.txt | 1 + llvm/benchmarks/FormatVariadicBM.cpp | 63 ++++++++++++++ llvm/include/llvm/Support/FormatVariadic.h | 39 +++++---- llvm/lib/Support/FormatVariadic.cpp | 85 ++++++++++++++++--- llvm/unittests/Support/FormatVariadicTest.cpp | 66 ++++++++------ mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 +- 7 files changed, 205 insertions(+), 58 deletions(-) create mode 100644 llvm/benchmarks/FormatVariadicBM.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 8f4bd17afc8581..4f30b2a0e7e7da 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1401,7 +1401,10 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, ErrnoNote = llvm::formatv("After calling '{0}' {1}", FunctionName, ErrnoNote); } else { - CaseNote = llvm::formatv(Case.getNote().str().c_str(), FunctionName); + // Disable formatv() validation as the case note may not always have the + // {0} placeholder for function name. 
+ CaseNote = + llvm::formatv(false, Case.getNote().str().c_str(), FunctionName); } const SVal RV = Call.getReturnValue(); diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index 713d4ccd3c5975..e3366e6f3ffe19 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -5,3 +5,4 @@ set(LLVM_LINK_COMPONENTS add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp new file mode 100644 index 00000000000000..c03ead400d0d5c --- /dev/null +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -0,0 +1,63 @@ +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +using namespace llvm; +using namespace std; + +// Generate a list of format strings that have `NumReplacements` replacements +// by permuting the replacements and some literal text. +static vector getFormatStrings(int NumReplacements) { + vector Components; + for (int I = 0; I < NumReplacements; I++) + Components.push_back("{" + to_string(I) + "}"); + // Intersperse these with some other literal text (_). 
+ const string_view Literal = "____"; + for (char C : Literal) + Components.push_back(string(1, C)); + + vector Formats; + do { + string Concat; + for (const string &C : Components) + Concat += C; + Formats.emplace_back(Concat); + } while (next_permutation(Components.begin(), Components.end())); + return Formats; +} + +// Generate the set of formats to exercise outside the benchmark code. +static const vector> Formats = { + getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), + getFormatStrings(4), getFormatStrings(5), +}; + +// Benchmark formatv() for a variety of format strings and 1-5 replacements. +static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 595f2cf559a428..f31ad70021579e 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -67,23 +67,20 @@ class formatv_object_base { protected: StringRef Fmt; ArrayRef Adapters; - - static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad); - - static std::pair - splitLiteralAndReplacement(StringRef Fmt); + bool Validate; formatv_object_base(StringRef Fmt, - ArrayRef Adapters) - : Fmt(Fmt), Adapters(Adapters) {} + ArrayRef Adapters, + bool Validate) + : Fmt(Fmt), Adapters(Adapters), Validate(Validate) {} formatv_object_base(formatv_object_base const &rhs) = delete; formatv_object_base(formatv_object_base &&rhs) = default; public: void 
format(raw_ostream &S) const { - for (auto &R : parseFormatString(Fmt)) { + const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); + for (const auto &R : Replacements) { if (R.Type == ReplacementType::Empty) continue; if (R.Type == ReplacementType::Literal) { @@ -101,9 +98,10 @@ class formatv_object_base { Align.format(S, R.Options); } } - static SmallVector parseFormatString(StringRef Fmt); - static std::optional parseReplacementItem(StringRef Spec); + // Parse and optionally validate format string (in debug builds). + static SmallVector + parseFormatString(StringRef Fmt, size_t NumArgs, bool Validate); std::string str() const { std::string Result; @@ -149,8 +147,8 @@ template class formatv_object : public formatv_object_base { }; public: - formatv_object(StringRef Fmt, Tuple &&Params) - : formatv_object_base(Fmt, ParameterPointers), + formatv_object(StringRef Fmt, Tuple &&Params, bool Validate) + : formatv_object_base(Fmt, ParameterPointers, Validate), Parameters(std::move(Params)) { ParameterPointers = std::apply(create_adapters(), Parameters); } @@ -247,15 +245,22 @@ template class formatv_object : public formatv_object_base { // assertion. Otherwise, it will try to do something reasonable, but in general // the details of what that is are undefined. // + +// formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(const char *Fmt, Ts &&...Vals) +inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) -> formatv_object(Vals))...))> { using ParamTuple = decltype(std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...)); - return formatv_object( - Fmt, std::make_tuple(support::detail::build_format_adapter( - std::forward(Vals))...)); + auto Params = std::make_tuple( + support::detail::build_format_adapter(std::forward(Vals))...); + return formatv_object(Fmt, std::move(Params), Validate); +} + +// formatv() with validation enabled. 
+template inline auto formatv(const char *Fmt, Ts &&...Vals) { + return formatv(true, Fmt, std::forward(Vals)...); } } // end namespace llvm diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index e25d036cdf1e8c..26d2b549136e43 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -25,8 +25,8 @@ static std::optional translateLocChar(char C) { LLVM_BUILTIN_UNREACHABLE; } -bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { +static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -35,8 +35,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, if (Spec.size() > 1) { // A maximum of 2 characters at the beginning can be used for something - // other - // than the width. + // other than the width. // If Spec[1] is a loc char, then Spec[0] is a pad char and Spec[2:...] // contains the width. // Otherwise, if Spec[0] is a loc char, then Spec[1:...] contains the width. 
@@ -55,8 +54,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, return !Failed; } -std::optional -formatv_object_base::parseReplacementItem(StringRef Spec) { +static std::optional parseReplacementItem(StringRef Spec) { StringRef RepString = Spec.trim("{}"); // If the replacement sequence does not start with a non-negative integer, @@ -82,15 +80,14 @@ formatv_object_base::parseReplacementItem(StringRef Spec) { RepString = StringRef(); } RepString = RepString.trim(); - if (!RepString.empty()) { - assert(false && "Unexpected characters found in replacement string!"); - } + assert(RepString.empty() && + "Unexpected characters found in replacement string!"); return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; } -std::pair -formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { +static std::pair +splitLiteralAndReplacement(StringRef Fmt) { while (!Fmt.empty()) { // Everything up until the first brace is a literal. if (Fmt.front() != '{') { @@ -143,15 +140,77 @@ formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { return std::make_pair(ReplacementItem{Fmt}, StringRef()); } +#ifndef NDEBUG +#define ENABLE_VALIDATION 1 +#else +#define ENABLE_VALIDATION 0 // Conveniently enable validation in release mode. 
+#endif + SmallVector -formatv_object_base::parseFormatString(StringRef Fmt) { +formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, + bool Validate) { SmallVector Replacements; - ReplacementItem I; + +#if ENABLE_VALIDATION + const StringRef SavedFmtStr = Fmt; + size_t NumExpectedArgs = 0; +#endif + while (!Fmt.empty()) { + ReplacementItem I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); if (I.Type != ReplacementType::Empty) Replacements.push_back(I); +#if ENABLE_VALIDATION + if (I.Type == ReplacementType::Format) + NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); +#endif + } + +#if ENABLE_VALIDATION + if (!Validate) + return Replacements; + + // Perform additional validation. Verify that the number of arguments matches + // the number of replacement indices and that there are no holes in the + // replacement indices. + + // When validation fails, return an array of replacement items that + // will print an error message as the outout of this formatv() (used when + // validation is enabled in release mode). + auto getErrorReplacements = [SavedFmtStr](StringLiteral ErrorMsg) { + return SmallVector{ + ReplacementItem("Invalid formatv() call: "), ReplacementItem(ErrorMsg), + ReplacementItem(" for format string: "), ReplacementItem(SavedFmtStr)}; + }; + + if (NumExpectedArgs != NumArgs) { + errs() << formatv( + "Expected {0} Args, but got {1} for format string '{2}'\n", + NumExpectedArgs, NumArgs, SavedFmtStr); + assert(0 && "Invalid formatv() call"); + return getErrorReplacements("Unexpected number of arguments"); + } + + // Find the number of unique indices seen. All replacement indices + // are < NumExpectedArgs. 
+ SmallVector Indices(NumExpectedArgs); + size_t Count = 0; + for (const ReplacementItem &I : Replacements) { + if (I.Type != ReplacementType::Format || Indices[I.Index]) + continue; + Indices[I.Index] = true; + ++Count; + } + + if (Count != NumExpectedArgs) { + errs() << formatv( + "Replacement field indices cannot have holes for format string '{0}'\n", + SavedFmtStr); + assert(0 && "Invalid format string"); + return getErrorReplacements("Replacement indices have holes"); } +#endif // ENABLE_VALIDATION return Replacements; } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index a78b25c53d7e43..4c648d87fc2de7 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -9,9 +9,11 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatAdapters.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; +using ::testing::HasSubstr; // Compile-time tests templates in the detail namespace. namespace { @@ -35,14 +37,19 @@ struct NoFormat {}; static_assert(uses_missing_provider::value, ""); } +// Helper to parse format string with no validation. 
+static SmallVector parseFormatString(StringRef Fmt) { + return formatv_object_base::parseFormatString(Fmt, 0, false); +} + TEST(FormatVariadicTest, EmptyFormatString) { - auto Replacements = formatv_object_base::parseFormatString(""); + auto Replacements = parseFormatString(""); EXPECT_EQ(0U, Replacements.size()); } TEST(FormatVariadicTest, NoReplacements) { const StringRef kFormatString = "This is a test"; - auto Replacements = formatv_object_base::parseFormatString(kFormatString); + auto Replacements = parseFormatString(kFormatString); ASSERT_EQ(1U, Replacements.size()); EXPECT_EQ(kFormatString, Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -50,25 +57,25 @@ TEST(FormatVariadicTest, NoReplacements) { TEST(FormatVariadicTest, EscapedBrace) { // {{ should be replaced with { - auto Replacements = formatv_object_base::parseFormatString("{{"); + auto Replacements = parseFormatString("{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // An even number N of braces should be replaced with N/2 braces. - Replacements = formatv_object_base::parseFormatString("{{{{{{"); + Replacements = parseFormatString("{{{{{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{{{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. - Replacements = formatv_object_base::parseFormatString("}"); + Replacements = parseFormatString("}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. 
- Replacements = formatv_object_base::parseFormatString("}}}"); + Replacements = parseFormatString("}}}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}}}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -76,14 +83,14 @@ TEST(FormatVariadicTest, EscapedBrace) { TEST(FormatVariadicTest, ValidReplacementSequence) { // 1. Simple replacement - parameter index only - auto Replacements = formatv_object_base::parseFormatString("{0}"); + auto Replacements = parseFormatString("{0}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(0u, Replacements[0].Align); EXPECT_EQ("", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{1}"); + Replacements = parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); @@ -92,7 +99,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 2. Parameter index with right alignment - Replacements = formatv_object_base::parseFormatString("{0,3}"); + Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -101,7 +108,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 3. And left alignment - Replacements = formatv_object_base::parseFormatString("{0,-3}"); + Replacements = parseFormatString("{0,-3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -110,7 +117,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. 
And center alignment - Replacements = formatv_object_base::parseFormatString("{0,=3}"); + Replacements = parseFormatString("{0,=3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -119,7 +126,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. Parameter index with option string - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -128,7 +135,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 5. Parameter index with alignment before option string - Replacements = formatv_object_base::parseFormatString("{0,-3:foo}"); + Replacements = parseFormatString("{0,-3:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -137,7 +144,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 7. Parameter indices, options, and alignment can all have whitespace. - Replacements = formatv_object_base::parseFormatString("{ 0, -3 : foo }"); + Replacements = parseFormatString("{ 0, -3 : foo }"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -147,7 +154,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. 
- Replacements = formatv_object_base::parseFormatString("{0:0:1}"); + Replacements = parseFormatString("{0:0:1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -157,7 +164,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:1", Replacements[0].Options); // 9. Custom padding character - Replacements = formatv_object_base::parseFormatString("{0,p+4:foo}"); + Replacements = parseFormatString("{0,p+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -168,7 +175,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // Format string special characters are allowed as padding character - Replacements = formatv_object_base::parseFormatString("{0,-+4:foo}"); + Replacements = parseFormatString("{0,-+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -178,7 +185,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,+-4:foo}"); + Replacements = parseFormatString("{0,+-4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -188,7 +195,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,==4:foo}"); + Replacements = parseFormatString("{0,==4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -198,7 +205,7 @@ TEST(FormatVariadicTest, 
ValidReplacementSequence) { EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,:=4:foo}"); + Replacements = parseFormatString("{0,:=4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -211,7 +218,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { TEST(FormatVariadicTest, DefaultReplacementValues) { // 2. If options string is missing, it defaults to empty. - auto Replacements = formatv_object_base::parseFormatString("{0,3}"); + auto Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -219,7 +226,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. - Replacements = formatv_object_base::parseFormatString("{0,3:}"); + Replacements = parseFormatString("{0,3:}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -227,7 +234,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); @@ -238,8 +245,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { } TEST(FormatVariadicTest, MultipleReplacements) { - auto Replacements = - formatv_object_base::parseFormatString("{0} {1:foo}-{2,-3:bar}"); + auto Replacements = parseFormatString("{0} {1:foo}-{2,-3:bar}"); ASSERT_EQ(5u, Replacements.size()); // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -704,6 +710,16 @@ TEST(FormatVariadicTest, FormatFilterRange) { EXPECT_EQ("1, 2, 3", formatv("{0}", Range).str()); } +#ifdef NDEBUG // Disable the test in debug builds where it will assert. +TEST(FormatVariadicTest, Validate) { + std::string Str = formatv("{0}", 1, 2).str(); + EXPECT_THAT(Str, HasSubstr("Unexpected number of arguments")); + + Str = formatv("{0} {2}", 1, 2, 3).str(); + EXPECT_THAT(Str, HasSubstr("eplacement indices have holes")); +} +#endif // NDEBUG + namespace { enum class Base { First }; diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 82f8718fc556ad..7016fe41ca75d0 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1654,12 +1654,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, dir->shouldBeQualified() ? 
qualifiedTypeParserCode : typeParserCode; TypeSwitch(dir->getArg()) .Case([&](auto operand) { - body << formatv(parserCode, + body << formatv(false, parserCode, operand->getVar()->constraint.getCppType(), listName); }) .Default([&](auto operand) { - body << formatv(parserCode, "::mlir::Type", listName); + body << formatv(false, parserCode, "::mlir::Type", listName); }); } } else if (auto *dir = dyn_cast(element)) { From 025f03f01e8584140b7ac27422cea0c0ef7ef6c1 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 29 Aug 2024 17:05:02 +0200 Subject: [PATCH 02/72] [libc++][NFC] Remove unused struct in (#106527) --- libcxx/include/string | 8 -------- 1 file changed, 8 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 05d42afb7c9c3d..45be4050304125 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -3462,14 +3462,6 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat // find -template -struct _LIBCPP_HIDDEN __traits_eq { - typedef typename _Traits::char_type char_type; - _LIBCPP_HIDE_FROM_ABI bool operator()(const char_type& __x, const char_type& __y) _NOEXCEPT { - return _Traits::eq(__x, __y); - } -}; - template _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { From a705e8cb5b071b3bf6d1d55629f18f6b7b9699ac Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 29 Aug 2024 17:05:56 +0200 Subject: [PATCH 03/72] [libc++][NFC] Remove __constexpr_is{nan,finite} (#106205) They're never used in `constexpr` functions, so we can simply use `std::isnan` and `std::isfinite` instead. 
--- libcxx/include/cmath | 28 ----------- libcxx/include/complex | 50 +++++++++---------- .../numerics/c.math/constexpr-fns.pass.cpp | 2 - 3 files changed, 24 insertions(+), 56 deletions(-) diff --git a/libcxx/include/cmath b/libcxx/include/cmath index 6480c4678ce33d..5d30b151870e0d 100644 --- a/libcxx/include/cmath +++ b/libcxx/include/cmath @@ -554,20 +554,6 @@ using ::scalbnl _LIBCPP_USING_IF_EXISTS; using ::tgammal _LIBCPP_USING_IF_EXISTS; using ::truncl _LIBCPP_USING_IF_EXISTS; -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { -#if __has_builtin(__builtin_isnan) - return __builtin_isnan(__lcpp_x); -#else - return isnan(__lcpp_x); -#endif -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { - return std::isnan(__lcpp_x); -} - template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NOEXCEPT { #if __has_builtin(__builtin_isinf) @@ -582,20 +568,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NO return std::isinf(__lcpp_x); } -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { -#if __has_builtin(__builtin_isfinite) - return __builtin_isfinite(__lcpp_x); -#else - return isfinite(__lcpp_x); -#endif -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { - return __builtin_isfinite(__lcpp_x); -} - #if _LIBCPP_STD_VER >= 20 template _LIBCPP_HIDE_FROM_ABI constexpr _Fp __lerp(_Fp __a, _Fp __b, _Fp __t) noexcept { diff --git a/libcxx/include/complex b/libcxx/include/complex index e6534025de57e5..94fd8ee347dffb 100644 --- a/libcxx/include/complex +++ b/libcxx/include/complex @@ -1019,9 +1019,9 @@ inline _LIBCPP_HIDE_FROM_ABI typename __libcpp_complex_overload_traits<_Tp>::_Co template _LIBCPP_HIDE_FROM_ABI 
complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = _Tp()) { - if (std::__constexpr_isnan(__rho) || std::signbit(__rho)) + if (std::isnan(__rho) || std::signbit(__rho)) return complex<_Tp>(_Tp(NAN), _Tp(NAN)); - if (std::__constexpr_isnan(__theta)) { + if (std::isnan(__theta)) { if (std::__constexpr_isinf(__rho)) return complex<_Tp>(__rho, __theta); return complex<_Tp>(__theta, __theta); @@ -1032,10 +1032,10 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = return complex<_Tp>(_Tp(NAN), _Tp(NAN)); } _Tp __x = __rho * std::cos(__theta); - if (std::__constexpr_isnan(__x)) + if (std::isnan(__x)) __x = 0; _Tp __y = __rho * std::sin(__theta); - if (std::__constexpr_isnan(__y)) + if (std::isnan(__y)) __y = 0; return complex<_Tp>(__x, __y); } @@ -1062,10 +1062,8 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> sqrt(const complex<_Tp>& __x) { return complex<_Tp>(_Tp(INFINITY), __x.imag()); if (std::__constexpr_isinf(__x.real())) { if (__x.real() > _Tp(0)) - return complex<_Tp>( - __x.real(), std::__constexpr_isnan(__x.imag()) ? __x.imag() : std::copysign(_Tp(0), __x.imag())); - return complex<_Tp>( - std::__constexpr_isnan(__x.imag()) ? __x.imag() : _Tp(0), std::copysign(__x.real(), __x.imag())); + return complex<_Tp>(__x.real(), std::isnan(__x.imag()) ? __x.imag() : std::copysign(_Tp(0), __x.imag())); + return complex<_Tp>(std::isnan(__x.imag()) ? 
__x.imag() : _Tp(0), std::copysign(__x.real(), __x.imag())); } return std::polar(std::sqrt(std::abs(__x)), std::arg(__x) / _Tp(2)); } @@ -1080,9 +1078,9 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> exp(const complex<_Tp>& __x) { } if (std::__constexpr_isinf(__x.real())) { if (__x.real() < _Tp(0)) { - if (!std::__constexpr_isfinite(__i)) + if (!std::isfinite(__i)) __i = _Tp(1); - } else if (__i == 0 || !std::__constexpr_isfinite(__i)) { + } else if (__i == 0 || !std::isfinite(__i)) { if (std::__constexpr_isinf(__i)) __i = _Tp(NAN); return complex<_Tp>(__x.real(), __i); @@ -1131,13 +1129,13 @@ template _LIBCPP_HIDE_FROM_ABI complex<_Tp> asinh(const complex<_Tp>& __x) { const _Tp __pi(atan2(+0., -0.)); if (std::__constexpr_isinf(__x.real())) { - if (std::__constexpr_isnan(__x.imag())) + if (std::isnan(__x.imag())) return __x; if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(__x.real(), std::copysign(__pi * _Tp(0.25), __x.imag())); return complex<_Tp>(__x.real(), std::copysign(_Tp(0), __x.imag())); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(__x.imag(), __x.real()); if (__x.imag() == 0) @@ -1156,7 +1154,7 @@ template _LIBCPP_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) { const _Tp __pi(atan2(+0., -0.)); if (std::__constexpr_isinf(__x.real())) { - if (std::__constexpr_isnan(__x.imag())) + if (std::isnan(__x.imag())) return complex<_Tp>(std::abs(__x.real()), __x.imag()); if (std::__constexpr_isinf(__x.imag())) { if (__x.real() > 0) @@ -1168,7 +1166,7 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) { return complex<_Tp>(-__x.real(), std::copysign(__pi, __x.imag())); return complex<_Tp>(__x.real(), std::copysign(_Tp(0), __x.imag())); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(std::abs(__x.imag()), __x.real()); return complex<_Tp>(__x.real(), 
__x.real()); @@ -1187,12 +1185,12 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) { if (std::__constexpr_isinf(__x.imag())) { return complex<_Tp>(std::copysign(_Tp(0), __x.real()), std::copysign(__pi / _Tp(2), __x.imag())); } - if (std::__constexpr_isnan(__x.imag())) { + if (std::isnan(__x.imag())) { if (std::__constexpr_isinf(__x.real()) || __x.real() == 0) return complex<_Tp>(std::copysign(_Tp(0), __x.real()), __x.imag()); return complex<_Tp>(__x.imag(), __x.imag()); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { return complex<_Tp>(__x.real(), __x.real()); } if (std::__constexpr_isinf(__x.real())) { @@ -1209,11 +1207,11 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) { template _LIBCPP_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) { - if (std::__constexpr_isinf(__x.real()) && !std::__constexpr_isfinite(__x.imag())) + if (std::__constexpr_isinf(__x.real()) && !std::isfinite(__x.imag())) return complex<_Tp>(__x.real(), _Tp(NAN)); - if (__x.real() == 0 && !std::__constexpr_isfinite(__x.imag())) + if (__x.real() == 0 && !std::isfinite(__x.imag())) return complex<_Tp>(__x.real(), _Tp(NAN)); - if (__x.imag() == 0 && !std::__constexpr_isfinite(__x.real())) + if (__x.imag() == 0 && !std::isfinite(__x.real())) return __x; return complex<_Tp>(std::sinh(__x.real()) * std::cos(__x.imag()), std::cosh(__x.real()) * std::sin(__x.imag())); } @@ -1222,13 +1220,13 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) { template _LIBCPP_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { - if (std::__constexpr_isinf(__x.real()) && !std::__constexpr_isfinite(__x.imag())) + if (std::__constexpr_isinf(__x.real()) && !std::isfinite(__x.imag())) return complex<_Tp>(std::abs(__x.real()), _Tp(NAN)); - if (__x.real() == 0 && !std::__constexpr_isfinite(__x.imag())) + if (__x.real() == 0 && !std::isfinite(__x.imag())) return complex<_Tp>(_Tp(NAN), __x.real()); if (__x.real() == 0 && 
__x.imag() == 0) return complex<_Tp>(_Tp(1), __x.imag()); - if (__x.imag() == 0 && !std::__constexpr_isfinite(__x.real())) + if (__x.imag() == 0 && !std::isfinite(__x.real())) return complex<_Tp>(std::abs(__x.real()), __x.imag()); return complex<_Tp>(std::cosh(__x.real()) * std::cos(__x.imag()), std::sinh(__x.real()) * std::sin(__x.imag())); } @@ -1238,11 +1236,11 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { template _LIBCPP_HIDE_FROM_ABI complex<_Tp> tanh(const complex<_Tp>& __x) { if (std::__constexpr_isinf(__x.real())) { - if (!std::__constexpr_isfinite(__x.imag())) + if (!std::isfinite(__x.imag())) return complex<_Tp>(std::copysign(_Tp(1), __x.real()), _Tp(0)); return complex<_Tp>(std::copysign(_Tp(1), __x.real()), std::copysign(_Tp(0), std::sin(_Tp(2) * __x.imag()))); } - if (std::__constexpr_isnan(__x.real()) && __x.imag() == 0) + if (std::isnan(__x.real()) && __x.imag() == 0) return __x; _Tp __2r(_Tp(2) * __x.real()); _Tp __2i(_Tp(2) * __x.imag()); @@ -1267,7 +1265,7 @@ template _LIBCPP_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) { const _Tp __pi(atan2(+0., -0.)); if (std::__constexpr_isinf(__x.real())) { - if (std::__constexpr_isnan(__x.imag())) + if (std::isnan(__x.imag())) return complex<_Tp>(__x.imag(), __x.real()); if (std::__constexpr_isinf(__x.imag())) { if (__x.real() < _Tp(0)) @@ -1278,7 +1276,7 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) { return complex<_Tp>(__pi, std::signbit(__x.imag()) ? -__x.real() : __x.real()); return complex<_Tp>(_Tp(0), std::signbit(__x.imag()) ? 
__x.real() : -__x.real()); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(__x.real(), -__x.imag()); return complex<_Tp>(__x.real(), __x.real()); diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp index 3739bc6ef04dd0..ff36293830c5de 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp @@ -20,9 +20,7 @@ #include "test_macros.h" -static_assert(std::__constexpr_isnan(0.) == false, ""); static_assert(std::__constexpr_isinf(0.0) == false, ""); -static_assert(std::__constexpr_isfinite(0.0) == true, ""); int main(int, char**) { From 032c3283ab419377a1230a32d98693b528f63134 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 08:06:45 -0700 Subject: [PATCH 04/72] [NFC][TableGen] Refactor IntrinsicEmitter code (#106479) - Use formatv() and raw string literals to simplify emission code. - Use range based for loops and structured bindings to simplify loops. - Use const Pointers to Records. - Rename `ComputeFixedEncoding` to `ComputeTypeSignature` to reflect what the function actually does, cnd change it to return a vector. - Use reverse() and range based for loop to pack 8 nibbles into 32-bits. - Rename some variables to follow LLVM coding standards. - For function memory effects, print human readable effects in comment. 
--- llvm/test/TableGen/intrinsic-attrs.td | 2 +- llvm/utils/TableGen/IntrinsicEmitter.cpp | 393 ++++++++++++----------- 2 files changed, 204 insertions(+), 191 deletions(-) diff --git a/llvm/test/TableGen/intrinsic-attrs.td b/llvm/test/TableGen/intrinsic-attrs.td index 22019b8fb87140..29e8cb1e89bb01 100644 --- a/llvm/test/TableGen/intrinsic-attrs.td +++ b/llvm/test/TableGen/intrinsic-attrs.td @@ -60,7 +60,7 @@ def int_deref_ptr_ret : Intrinsic<[llvm_ptr_ty], [], [DereferenceableOffset, e = Set->Offset + Set->Count; i != e; ++i) { - OS << " " << Ints[i].EnumName; + bool First = true; + for (const auto &Int : ArrayRef(&Ints[Set->Offset], Set->Count)) { + OS << " " << Int.EnumName; // Assign a value to the first intrinsic in this target set so that all // intrinsic ids are distinct. - if (i == Set->Offset) - OS << " = " << (Set->Offset + 1); + if (First) { + OS << " = " << Set->Offset + 1; + First = false; + } OS << ", "; - if (Ints[i].EnumName.size() < 40) - OS.indent(40 - Ints[i].EnumName.size()); - OS << " // " << Ints[i].Name << "\n"; + if (Int.EnumName.size() < 40) + OS.indent(40 - Int.EnumName.size()); + OS << formatv(" // {0}\n", Int.Name); } // Emit num_intrinsics into the target neutral enum. 
if (IntrinsicPrefix.empty()) { - OS << " num_intrinsics = " << (Ints.size() + 1) << "\n"; + OS << formatv(" num_intrinsics = {0}\n", Ints.size() + 1); OS << "#endif\n\n"; } else { - OS << "}; // enum\n"; - OS << "} // namespace llvm::Intrinsic\n\n"; - OS << "#endif\n"; + OS << R"(}; // enum +} // namespace llvm::Intrinsic +#endif + +)"; } } @@ -181,8 +186,8 @@ void IntrinsicEmitter::EmitArgKind(raw_ostream &OS) { return; OS << "// llvm::Intrinsic::IITDescriptor::ArgKind.\n"; OS << "#ifdef GET_INTRINSIC_ARGKIND\n"; - if (auto RecArgKind = Records.getDef("ArgKind")) { - for (auto &RV : RecArgKind->getValues()) + if (const auto RecArgKind = Records.getDef("ArgKind")) { + for (const auto &RV : RecArgKind->getValues()) OS << " AK_" << RV.getName() << " = " << *RV.getValue() << ",\n"; } else { OS << "#error \"ArgKind is not defined\"\n"; @@ -194,7 +199,7 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) { OS << "#ifdef GET_INTRINSIC_IITINFO\n"; std::array RecsByNumber; auto IIT_Base = Records.getAllDerivedDefinitionsIfDefined("IIT_Base"); - for (auto Rec : IIT_Base) { + for (const Record *Rec : IIT_Base) { auto Number = Rec->getValueAsInt("Number"); assert(0 <= Number && Number < (int)RecsByNumber.size() && "IIT_Info.Number should be uint8_t"); @@ -213,26 +218,29 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) { void IntrinsicEmitter::EmitTargetInfo(const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Target mapping.\n"; - OS << "#ifdef GET_INTRINSIC_TARGET_DATA\n"; - OS << "struct IntrinsicTargetInfo {\n" - << " llvm::StringLiteral Name;\n" - << " size_t Offset;\n" - << " size_t Count;\n" - << "};\n"; - OS << "static constexpr IntrinsicTargetInfo TargetInfos[] = {\n"; - for (const auto &Target : Ints.Targets) - OS << " {llvm::StringLiteral(\"" << Target.Name << "\"), " << Target.Offset - << ", " << Target.Count << "},\n"; - OS << "};\n"; - OS << "#endif\n\n"; + OS << R"(// Target mapping. 
+#ifdef GET_INTRINSIC_TARGET_DATA +struct IntrinsicTargetInfo { + StringLiteral Name; + size_t Offset; + size_t Count; +}; +static constexpr IntrinsicTargetInfo TargetInfos[] = { +)"; + for (const auto [Name, Offset, Count] : Ints.Targets) + OS << formatv(" {{\"{0}\", {1}, {2}},\n", Name, Offset, Count); + OS << R"(}; +#endif + +)"; } void IntrinsicEmitter::EmitIntrinsicToNameTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Intrinsic ID to name table.\n"; - OS << "#ifdef GET_INTRINSIC_NAME_TABLE\n"; - OS << " // Note that entry #0 is the invalid intrinsic!\n"; + OS << R"(// Intrinsic ID to name table. +#ifdef GET_INTRINSIC_NAME_TABLE +// Note that entry #0 is the invalid intrinsic! +)"; for (const auto &Int : Ints) OS << " \"" << Int.Name << "\",\n"; OS << "#endif\n\n"; @@ -240,16 +248,19 @@ void IntrinsicEmitter::EmitIntrinsicToNameTable( void IntrinsicEmitter::EmitIntrinsicToOverloadTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Intrinsic ID to overload bitset.\n"; - OS << "#ifdef GET_INTRINSIC_OVERLOAD_TABLE\n"; - OS << "static constexpr uint8_t OTable[] = {\n"; - OS << " 0"; - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { + OS << R"(// Intrinsic ID to overload bitset. +#ifdef GET_INTRINSIC_OVERLOAD_TABLE +static constexpr uint8_t OTable[] = { + 0 + )"; + for (auto [I, Int] : enumerate(Ints)) { // Add one to the index so we emit a null bit for the invalid #0 intrinsic. - if ((i + 1) % 8 == 0) + size_t Idx = I + 1; + + if (Idx % 8 == 0) OS << ",\n 0"; - if (Ints[i].isOverloaded) - OS << " | (1<<" << (i + 1) % 8 << ')'; + if (Int.isOverloaded) + OS << " | (1<<" << Idx % 8 << ')'; } OS << "\n};\n\n"; // OTable contains a true bit at the position if the intrinsic is overloaded. @@ -257,20 +268,18 @@ void IntrinsicEmitter::EmitIntrinsicToOverloadTable( OS << "#endif\n\n"; } -/// ComputeFixedEncoding - If we can encode the type signature for this -/// intrinsic into 32 bits, return it. If not, return ~0U. 
-static void ComputeFixedEncoding(const CodeGenIntrinsic &Int, - std::vector &TypeSig) { - if (auto *R = Int.TheDef->getValue("TypeSig")) { - for (auto &a : cast(R->getValue())->getValues()) { - for (auto &b : cast(a)->getValues()) - TypeSig.push_back(cast(b)->getValue()); +using TypeSigTy = SmallVector; + +/// Computes type signature of the intrinsic \p Int. +static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) { + TypeSigTy TypeSig; + if (const auto *R = Int.TheDef->getValue("TypeSig")) { + for (const auto *a : cast(R->getValue())->getValues()) { + for (const auto *b : cast(a)->getValues()) + TypeSig.emplace_back(cast(b)->getValue()); } } -} - -static void printIITEntry(raw_ostream &OS, unsigned char X) { - OS << (unsigned)X; + return TypeSig; } void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, @@ -278,29 +287,28 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, // If we can compute a 32-bit fixed encoding for this intrinsic, do so and // capture it in this vector, otherwise store a ~0U. std::vector FixedEncodings; + SequenceToOffsetTable LongEncodingTable; - SequenceToOffsetTable> LongEncodingTable; - - std::vector TypeSig; + FixedEncodings.reserve(Ints.size()); // Compute the unique argument type info. - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { + for (const CodeGenIntrinsic &Int : Ints) { // Get the signature for the intrinsic. - TypeSig.clear(); - ComputeFixedEncoding(Ints[i], TypeSig); + TypeSigTy TypeSig = ComputeTypeSignature(Int); - // Check to see if we can encode it into a 32-bit word. We can only encode + // Check to see if we can encode it into a 32-bit word. We can only encode // 8 nibbles into a 32-bit word. if (TypeSig.size() <= 8) { - bool Failed = false; + // Attempt to pack elements of TypeSig into a 32-bit word, starting from + // the most significant nibble. 
unsigned Result = 0; - for (unsigned i = 0, e = TypeSig.size(); i != e; ++i) { - // If we had an unencodable argument, bail out. - if (TypeSig[i] > 15) { + bool Failed = false; + for (unsigned char C : reverse(TypeSig)) { + if (C > 15) { Failed = true; break; } - Result = (Result << 4) | TypeSig[e - i - 1]; + Result = (Result << 4) | C; } // If this could be encoded into a 31-bit word, return it. @@ -320,23 +328,22 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, LongEncodingTable.layout(); - OS << "// Global intrinsic function declaration type table.\n"; - OS << "#ifdef GET_INTRINSIC_GENERATOR_GLOBAL\n"; + OS << R"(// Global intrinsic function declaration type table. +#ifdef GET_INTRINSIC_GENERATOR_GLOBAL +static constexpr unsigned IIT_Table[] = { + )"; - OS << "static constexpr unsigned IIT_Table[] = {\n "; - - for (unsigned i = 0, e = FixedEncodings.size(); i != e; ++i) { - if ((i & 7) == 7) + for (auto [Idx, FixedEncoding, Int] : enumerate(FixedEncodings, Ints)) { + if ((Idx & 7) == 7) OS << "\n "; // If the entry fit in the table, just emit it. - if (FixedEncodings[i] != ~0U) { - OS << "0x" << Twine::utohexstr(FixedEncodings[i]) << ", "; + if (FixedEncoding != ~0U) { + OS << "0x" << Twine::utohexstr(FixedEncoding) << ", "; continue; } - TypeSig.clear(); - ComputeFixedEncoding(Ints[i], TypeSig); + TypeSigTy TypeSig = ComputeTypeSignature(Int); // Otherwise, emit the offset into the long encoding table. We emit it this // way so that it is easier to read the offset in the .def file. @@ -348,7 +355,8 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, // Emit the shared table of register lists. 
OS << "static constexpr unsigned char IIT_LongEncodingTable[] = {\n"; if (!LongEncodingTable.empty()) - LongEncodingTable.emit(OS, printIITEntry); + LongEncodingTable.emit( + OS, [](raw_ostream &OS, unsigned char C) { OS << (unsigned)C; }); OS << " 255\n};\n\n"; OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL @@ -399,16 +407,14 @@ struct AttributeComparator { /// EmitAttributes - This emits the Intrinsic::getAttributes method. void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Add parameter attributes that are not common to all intrinsics.\n"; - OS << "#ifdef GET_INTRINSIC_ATTRIBUTES\n"; - + OS << R"(// Add parameter attributes that are not common to all intrinsics. +#ifdef GET_INTRINSIC_ATTRIBUTES +static AttributeSet getIntrinsicArgAttributeSet(LLVMContext &C, unsigned ID) { + switch (ID) { + default: llvm_unreachable("Invalid attribute set number");)"; // Compute unique argument attribute sets. std::map, unsigned> UniqArgAttributes; - OS << "static AttributeSet getIntrinsicArgAttributeSet(" - << "LLVMContext &C, unsigned ID) {\n" - << " switch (ID) {\n" - << " default: llvm_unreachable(\"Invalid attribute set number\");\n"; for (const CodeGenIntrinsic &Int : Ints) { for (auto &Attrs : Int.ArgumentAttributes) { if (Attrs.empty()) @@ -419,118 +425,127 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, continue; assert(is_sorted(Attrs) && "Argument attributes are not sorted"); - - OS << " case " << ID << ":\n"; - OS << " return AttributeSet::get(C, {\n"; - for (const CodeGenIntrinsic::ArgAttribute &Attr : Attrs) { - switch (Attr.Kind) { + auto getAttrEnumName = + [](CodeGenIntrinsic::ArgAttrKind Kind) -> StringRef { + switch (Kind) { case CodeGenIntrinsic::NoCapture: - OS << " Attribute::get(C, Attribute::NoCapture),\n"; - break; + return "NoCapture"; case CodeGenIntrinsic::NoAlias: - OS << " Attribute::get(C, Attribute::NoAlias),\n"; - break; + return "NoAlias"; case 
CodeGenIntrinsic::NoUndef: - OS << " Attribute::get(C, Attribute::NoUndef),\n"; - break; + return "NoUndef"; case CodeGenIntrinsic::NonNull: - OS << " Attribute::get(C, Attribute::NonNull),\n"; - break; + return "NonNull"; case CodeGenIntrinsic::Returned: - OS << " Attribute::get(C, Attribute::Returned),\n"; - break; + return "Returned"; case CodeGenIntrinsic::ReadOnly: - OS << " Attribute::get(C, Attribute::ReadOnly),\n"; - break; + return "ReadOnly"; case CodeGenIntrinsic::WriteOnly: - OS << " Attribute::get(C, Attribute::WriteOnly),\n"; - break; + return "WriteOnly"; case CodeGenIntrinsic::ReadNone: - OS << " Attribute::get(C, Attribute::ReadNone),\n"; - break; + return "ReadNone"; case CodeGenIntrinsic::ImmArg: - OS << " Attribute::get(C, Attribute::ImmArg),\n"; - break; + return "ImmArg"; case CodeGenIntrinsic::Alignment: - OS << " Attribute::get(C, Attribute::Alignment, " << Attr.Value - << "),\n"; - break; + return "Alignment"; case CodeGenIntrinsic::Dereferenceable: - OS << " Attribute::get(C, Attribute::Dereferenceable, " - << Attr.Value << "),\n"; - break; + return "Dereferenceable"; } + }; + + OS << formatv(R"( + case {0}: + return AttributeSet::get(C, {{ +)", + ID); + for (const CodeGenIntrinsic::ArgAttribute &Attr : Attrs) { + StringRef AttrName = getAttrEnumName(Attr.Kind); + if (Attr.Kind == CodeGenIntrinsic::Alignment || + Attr.Kind == CodeGenIntrinsic::Dereferenceable) + OS << formatv(" Attribute::get(C, Attribute::{0}, {1}),\n", + AttrName, Attr.Value); + else + OS << formatv(" Attribute::get(C, Attribute::{0}),\n", AttrName); } - OS << " });\n"; + OS << " });"; } } - OS << " }\n"; - OS << "}\n\n"; + OS << R"( + } +} // getIntrinsicArgAttributeSet)"; // Compute unique function attribute sets. 
std::map UniqFnAttributes; - OS << "static AttributeSet getIntrinsicFnAttributeSet(" - << "LLVMContext &C, unsigned ID) {\n" - << " switch (ID) {\n" - << " default: llvm_unreachable(\"Invalid attribute set number\");\n"; + OS << R"( +static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { + switch (ID) { + default: llvm_unreachable("Invalid attribute set number");)"; for (const CodeGenIntrinsic &Intrinsic : Ints) { unsigned ID = UniqFnAttributes.size(); if (!UniqFnAttributes.try_emplace(&Intrinsic, ID).second) continue; - - OS << " case " << ID << ":\n" - << " return AttributeSet::get(C, {\n"; + OS << formatv(R"( + case {0}: + return AttributeSet::get(C, {{ +)", + ID); + auto addAttribute = [&OS](StringRef Attr) { + OS << formatv(" Attribute::get(C, Attribute::{0}),\n", Attr); + }; if (!Intrinsic.canThrow) - OS << " Attribute::get(C, Attribute::NoUnwind),\n"; + addAttribute("NoUnwind"); if (Intrinsic.isNoReturn) - OS << " Attribute::get(C, Attribute::NoReturn),\n"; + addAttribute("NoReturn"); if (Intrinsic.isNoCallback) - OS << " Attribute::get(C, Attribute::NoCallback),\n"; + addAttribute("NoCallback"); if (Intrinsic.isNoSync) - OS << " Attribute::get(C, Attribute::NoSync),\n"; + addAttribute("NoSync"); if (Intrinsic.isNoFree) - OS << " Attribute::get(C, Attribute::NoFree),\n"; + addAttribute("NoFree"); if (Intrinsic.isWillReturn) - OS << " Attribute::get(C, Attribute::WillReturn),\n"; + addAttribute("WillReturn"); if (Intrinsic.isCold) - OS << " Attribute::get(C, Attribute::Cold),\n"; + addAttribute("Cold"); if (Intrinsic.isNoDuplicate) - OS << " Attribute::get(C, Attribute::NoDuplicate),\n"; + addAttribute("NoDuplicate"); if (Intrinsic.isNoMerge) - OS << " Attribute::get(C, Attribute::NoMerge),\n"; + addAttribute("NoMerge"); if (Intrinsic.isConvergent) - OS << " Attribute::get(C, Attribute::Convergent),\n"; + addAttribute("Convergent"); if (Intrinsic.isSpeculatable) - OS << " Attribute::get(C, Attribute::Speculatable),\n"; + 
addAttribute("Speculatable"); if (Intrinsic.isStrictFP) - OS << " Attribute::get(C, Attribute::StrictFP),\n"; + addAttribute("StrictFP"); MemoryEffects ME = Intrinsic.ME; // TODO: IntrHasSideEffects should affect not only readnone intrinsics. if (ME.doesNotAccessMemory() && Intrinsic.hasSideEffects) ME = MemoryEffects::unknown(); if (ME != MemoryEffects::unknown()) { - OS << " Attribute::getWithMemoryEffects(C, " - << "MemoryEffects::createFromIntValue(" << ME.toIntValue() << ")),\n"; + OS << formatv(" // {0}\n", ME); + OS << formatv(" Attribute::getWithMemoryEffects(C, " + "MemoryEffects::createFromIntValue({0})),\n", + ME.toIntValue()); } OS << " });\n"; } - OS << " }\n"; - OS << "}\n\n"; - OS << "AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) {\n"; + OS << R"( } +} // getIntrinsicFnAttributeSet + +AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) { +)"; - // Compute the maximum number of attribute arguments and the map + // Compute the maximum number of attribute arguments and the map. typedef std::map UniqAttrMapTy; UniqAttrMapTy UniqAttributes; - unsigned maxArgAttrs = 0; + unsigned MaxArgAttrs = 0; unsigned AttrNum = 0; - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { - const CodeGenIntrinsic &intrinsic = Ints[i]; - maxArgAttrs = - std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size())); - unsigned &N = UniqAttributes[&intrinsic]; + for (const CodeGenIntrinsic &Int : Ints) { + MaxArgAttrs = + std::max(MaxArgAttrs, unsigned(Int.ArgumentAttributes.size())); + unsigned &N = UniqAttributes[&Int]; if (N) continue; N = ++AttrNum; @@ -539,67 +554,65 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, // Emit an array of AttributeList. Most intrinsics will have at least one // entry, for the function itself (index ~1), which is usually nounwind. 
- OS << " static constexpr uint16_t IntrinsicsToAttributesMap[] = {\n"; - - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { - const CodeGenIntrinsic &intrinsic = Ints[i]; + OS << " static constexpr uint16_t IntrinsicsToAttributesMap[] = {"; + for (const CodeGenIntrinsic &Int : Ints) + OS << formatv("\n {0}, // {1}", UniqAttributes[&Int], Int.Name); - OS << " " << UniqAttributes[&intrinsic] << ", // " << intrinsic.Name - << "\n"; - } - OS << " };\n\n"; - - OS << " std::pair AS[" << maxArgAttrs + 1 << "];\n"; - OS << " unsigned NumAttrs = 0;\n"; - OS << " if (id != 0) {\n"; - OS << " switch(IntrinsicsToAttributesMap[id - 1]) {\n"; - OS << " default: llvm_unreachable(\"Invalid attribute number\");\n"; - for (auto UniqAttribute : UniqAttributes) { - OS << " case " << UniqAttribute.second << ": {\n"; + OS << formatv(R"( + }; + std::pair AS[{0}]; + unsigned NumAttrs = 0; + if (id != 0) {{ + switch(IntrinsicsToAttributesMap[id - 1]) {{ + default: llvm_unreachable("Invalid attribute number"); +)", + MaxArgAttrs + 1); - const CodeGenIntrinsic &Intrinsic = *(UniqAttribute.first); + for (const auto [IntPtr, UniqueID] : UniqAttributes) { + OS << formatv(" case {0}:\n", UniqueID); + const CodeGenIntrinsic &Int = *IntPtr; // Keep track of the number of attributes we're writing out. 
- unsigned numAttrs = 0; + unsigned NumAttrs = 0; - for (const auto &[AttrIdx, Attrs] : - enumerate(Intrinsic.ArgumentAttributes)) { + for (const auto &[AttrIdx, Attrs] : enumerate(Int.ArgumentAttributes)) { if (Attrs.empty()) continue; - unsigned ID = UniqArgAttributes.find(Attrs)->second; - OS << " AS[" << numAttrs++ << "] = {" << AttrIdx - << ", getIntrinsicArgAttributeSet(C, " << ID << ")};\n"; + unsigned ArgAttrID = UniqArgAttributes.find(Attrs)->second; + OS << formatv( + " AS[{0}] = {{{1}, getIntrinsicArgAttributeSet(C, {2})};\n", + NumAttrs++, AttrIdx, ArgAttrID); } - if (!Intrinsic.canThrow || - (Intrinsic.ME != MemoryEffects::unknown() && - !Intrinsic.hasSideEffects) || - Intrinsic.isNoReturn || Intrinsic.isNoCallback || Intrinsic.isNoSync || - Intrinsic.isNoFree || Intrinsic.isWillReturn || Intrinsic.isCold || - Intrinsic.isNoDuplicate || Intrinsic.isNoMerge || - Intrinsic.isConvergent || Intrinsic.isSpeculatable || - Intrinsic.isStrictFP) { - unsigned ID = UniqFnAttributes.find(&Intrinsic)->second; - OS << " AS[" << numAttrs++ << "] = {AttributeList::FunctionIndex, " - << "getIntrinsicFnAttributeSet(C, " << ID << ")};\n"; + if (!Int.canThrow || + (Int.ME != MemoryEffects::unknown() && !Int.hasSideEffects) || + Int.isNoReturn || Int.isNoCallback || Int.isNoSync || Int.isNoFree || + Int.isWillReturn || Int.isCold || Int.isNoDuplicate || Int.isNoMerge || + Int.isConvergent || Int.isSpeculatable || Int.isStrictFP) { + unsigned FnAttrID = UniqFnAttributes.find(&Int)->second; + OS << formatv(" AS[{0}] = {{AttributeList::FunctionIndex, " + "getIntrinsicFnAttributeSet(C, {1})};\n", + NumAttrs++, FnAttrID); } - if (numAttrs) { - OS << " NumAttrs = " << numAttrs << ";\n"; - OS << " break;\n"; - OS << " }\n"; + if (NumAttrs) { + OS << formatv(R"( NumAttrs = {0}; + break; +)", + NumAttrs); } else { OS << " return AttributeList();\n"; - OS << " }\n"; } } - OS << " }\n"; - OS << " }\n"; - OS << " return AttributeList::get(C, ArrayRef(AS, NumAttrs));\n"; - OS << 
"}\n"; - OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n"; + OS << R"( } + } + return AttributeList::get(C, ArrayRef(AS, NumAttrs)); +} +#endif // GET_INTRINSIC_ATTRIBUTES + +)"; } void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( From 4ee2ad259812159c4f51bf2d8edcf0376302b2c3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 29 Aug 2024 19:36:24 +0400 Subject: [PATCH 05/72] AArch64: Add tests for atomicrmw fp operations (#103701) There were only codegen tests for the fadd vector case, so round out the test coverage for the scalar cases and all the other operations. --- .../AArch64/atomicrmw-fadd-fp-vector.ll | 115 -- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 1209 ++++++++++++++++ llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 1272 +++++++++++++++++ llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 1272 +++++++++++++++++ llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 1209 ++++++++++++++++ 5 files changed, 4962 insertions(+), 115 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll deleted file mode 100644 index a7539ac3cce802..00000000000000 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll +++ /dev/null @@ -1,115 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s -; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s - -define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> 
%value) #0 { -; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4: -; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB0_2 -; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4: -; LSE: // %bb.0: -; LSE-NEXT: fcvtl v1.4s, v0.4h -; LSE-NEXT: ldr s0, [x0] -; LSE-NEXT: .LBB0_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fcvtl v2.4s, v0.4h -; LSE-NEXT: fmov w8, s0 -; LSE-NEXT: mov w10, w8 -; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; LSE-NEXT: fcvtn v2.4h, v2.4s -; LSE-NEXT: fmov w9, s2 -; LSE-NEXT: casal w10, w9, [x0] -; LSE-NEXT: fmov s0, w10 -; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB0_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; LSE-NEXT: ret - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4 - ret <2 x half> %res -} - -define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 { -; 
NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8: -; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB1_2 -; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 -; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; NOLSE-NEXT: fmov d0, d1 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8: -; LSE: // %bb.0: -; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB1_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; LSE-NEXT: fmov x8, d1 -; LSE-NEXT: mov x10, x8 -; LSE-NEXT: fmov x9, d2 -; LSE-NEXT: casal x10, x9, [x0] -; LSE-NEXT: fmov d1, x10 -; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB1_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: fmov d0, d1 -; LSE-NEXT: ret - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8 - ret <2 x float> %res -} - -attributes #0 = { nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll new file mode 100644 index 00000000000000..89c9880ffc7868 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -0,0 +1,1209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: 
test_atomicrmw_fadd_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne 
.LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; 
LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 
Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; 
NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: 
.LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: 
// %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, 
[x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fadd s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: 
Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fadd d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fadd_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl __addtf3 +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, 
#16] // 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl __addtf3 +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl __addtf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, 
[x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvtl v1.4s, v0.4h +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: fcvtl v2.4s, v0.4h +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s +; NOLSE-NEXT: fcvtn v2.4h, v2.4s +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvtl v1.4s, v0.4h +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvtl v2.4s, v0.4h +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s +; LSE-NEXT: fcvtn v2.4h, v2.4s +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, 
[x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: movi v1.4s, #1 +; NOLSE-NEXT: movi v2.4s, #127, msl #8 +; NOLSE-NEXT: shll v3.4s, v0.4h, #16 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: shll v4.4s, v0.4h, #16 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s +; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 +; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b +; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s +; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; 
NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: movi v1.4s, #1 +; LSE-NEXT: movi v2.4s, #127, msl #8 +; LSE-NEXT: shll v3.4s, v0.4h, #16 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: shll v4.4s, v0.4h, #16 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: fadd v4.4s, v4.4s, v3.4s +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: ushr v5.4s, v4.4s, #16 +; LSE-NEXT: and v5.16b, v5.16b, v1.16b +; LSE-NEXT: add v4.4s, v5.4s, v4.4s +; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fadd_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fadd v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; 
LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: 
Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll new file mode 100644 index 00000000000000..998d8ae0c1de4d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +; FIXME: Windows hosts assigns stack slots to different offsets for some reason. 
+; UNSUPPORTED: system-windows + +define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, 
#-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fmaxnm s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmax_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; 
NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fmaxnm d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fmax_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl fmaxl +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, #16] 
// 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl fmaxl +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl fmaxl +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; 
SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s1, h1 +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fcvt s4, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fcvt s3, h3 +; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h3, s3 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB7_3: // 
%atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: fcvt s1, h1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fcvt s4, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fcvt s3, h3 +; LSE-NEXT: fmaxnm s4, s4, s2 +; LSE-NEXT: fmaxnm s3, s3, s1 +; LSE-NEXT: fcvt h4, s4 +; LSE-NEXT: fcvt h3, s3 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] 
+; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s4, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: ubfx w12, w10, #16, #1 +; NOLSE-NEXT: add w10, w10, w8 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: add w10, w12, w10 +; NOLSE-NEXT: lsr w10, w10, #16 +; NOLSE-NEXT: ubfx w11, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: fmov s4, w10 +; 
NOLSE-NEXT: add w9, w11, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w11, [x0] +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s1 +; LSE-NEXT: fmov s2, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmaxnm s4, s4, s2 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmaxnm s3, s3, s1 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: ubfx w12, w10, #16, #1 +; LSE-NEXT: add w10, w10, w8 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: add w10, w12, w10 +; LSE-NEXT: lsr w10, w10, #16 +; LSE-NEXT: ubfx w11, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: add w9, w11, w9 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: casal w11, w10, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w9 +; LSE-NEXT: b.ne 
.LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal 
x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] 
+; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov 
x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll new file mode 100644 index 00000000000000..2697dbf5b2191d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc 
-mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +; FIXME: Windows hosts assigns stack slots to different offsets for some reason. +; UNSUPPORTED: system-windows + +define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: 
def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmin_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, half %value seq_cst, 
align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; 
LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, 
w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: 
fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: 
// in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fminnm s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne 
.LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fminnm d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fmin_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl fminl +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, #16] 
// 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl fminl +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl fminl +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; 
SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s1, h1 +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fcvt s4, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fcvt s3, h3 +; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h3, s3 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB7_3: // 
%atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: fcvt s1, h1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fcvt s4, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fcvt s3, h3 +; LSE-NEXT: fminnm s4, s4, s2 +; LSE-NEXT: fminnm s3, s3, s1 +; LSE-NEXT: fcvt h4, s4 +; LSE-NEXT: fcvt h3, s3 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] 
+; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s4, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: ubfx w12, w10, #16, #1 +; NOLSE-NEXT: add w10, w10, w8 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: add w10, w12, w10 +; NOLSE-NEXT: lsr w10, w10, #16 +; NOLSE-NEXT: ubfx w11, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: fmov s4, w10 +; 
NOLSE-NEXT: add w9, w11, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w11, [x0] +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s1 +; LSE-NEXT: fmov s2, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fminnm s4, s4, s2 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fminnm s3, s3, s1 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: ubfx w12, w10, #16, #1 +; LSE-NEXT: add w10, w10, w8 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: add w10, w12, w10 +; LSE-NEXT: lsr w10, w10, #16 +; LSE-NEXT: ubfx w11, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: add w9, w11, w9 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: casal w11, w10, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w9 +; LSE-NEXT: b.ne 
.LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal 
x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fminnm v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] 
+; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov 
x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll new file mode 100644 index 00000000000000..f41ddcb81d5ca5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -0,0 +1,1209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc 
-mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half 
@test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { 
+; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 
// 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fsub s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fsub_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 
+; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fsub d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fsub_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl __subtf3 +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, 
#16] // 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl __subtf3 +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl __subtf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, 
[x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvtl v1.4s, v0.4h +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: fcvtl v2.4s, v0.4h +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub v2.4s, v2.4s, v1.4s +; NOLSE-NEXT: fcvtn v2.4h, v2.4s +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvtl v1.4s, v0.4h +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvtl v2.4s, v0.4h +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub v2.4s, v2.4s, v1.4s +; LSE-NEXT: fcvtn v2.4h, v2.4s +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, 
[x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: movi v1.4s, #1 +; NOLSE-NEXT: movi v2.4s, #127, msl #8 +; NOLSE-NEXT: shll v3.4s, v0.4h, #16 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: shll v4.4s, v0.4h, #16 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub v4.4s, v4.4s, v3.4s +; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 +; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b +; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s +; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; 
NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: movi v1.4s, #1 +; LSE-NEXT: movi v2.4s, #127, msl #8 +; LSE-NEXT: shll v3.4s, v0.4h, #16 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: shll v4.4s, v0.4h, #16 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: fsub v4.4s, v4.4s, v3.4s +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: ushr v5.4s, v4.4s, #16 +; LSE-NEXT: and v5.16b, v5.16b, v1.16b +; LSE-NEXT: add v4.4s, v5.4s, v4.4s +; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fsub v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fsub_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fsub v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; 
LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: 
Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } From 5048fabb0579f1417f69cde49221b5b9e9c15414 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 08:40:15 -0700 Subject: [PATCH 06/72] [Support] Delete FormatVariadicTest Validate sub-test (#106570) - The subtest, if enabled correctly, will fail with assert in Debug builds and validation is disabled in Release builds. - Hence deleting the test to fix test failures in CI. 
--- llvm/unittests/Support/FormatVariadicTest.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index 4c648d87fc2de7..6ee0d924867419 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -710,16 +710,6 @@ TEST(FormatVariadicTest, FormatFilterRange) { EXPECT_EQ("1, 2, 3", formatv("{0}", Range).str()); } -#ifdef NDEBUG // Disable the test in debug builds where it will assert. -TEST(FormatVariadicTest, Validate) { - std::string Str = formatv("{0}", 1, 2).str(); - EXPECT_THAT(Str, HasSubstr("Unexpected number of arguments")); - - Str = formatv("{0} {2}", 1, 2, 3).str(); - EXPECT_THAT(Str, HasSubstr("eplacement indices have holes")); -} -#endif // NDEBUG - namespace { enum class Base { First }; From 26c3a8404f1b3327a0982faeeaee94b08d1ee481 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 29 Aug 2024 19:48:51 +0400 Subject: [PATCH 07/72] AArch64: Use consistent atomicrmw expansion for FP operations (#103702) Use LLSC or cmpxchg in the same cases as for the unsupported integer operations. This required some fixups to the LLSC implementation to deal with the fp128 case. The comment about floating-point exceptions was wrong, because floating-point exceptions are not really exceptions at all. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 59 ++- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 330 ++++------------ llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 356 +++++------------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 356 +++++------------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 330 ++++------------ 5 files changed, 376 insertions(+), 1055 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 02390e0a85c0a5..a49dfdb28a41eb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27096,21 +27096,37 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { : AtomicExpansionKind::LLSC; } +// Return true if the atomic operation expansion will lower to use a library +// call, and is thus ineligible to use an LLSC expansion. +static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, + const AtomicRMWInst *RMW) { + if (!RMW->isFloatingPointOperation()) + return false; + switch (RMW->getType()->getScalarType()->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::HalfTyID: + case Type::BFloatTyID: + // Will use soft float + return !Subtarget.hasFPARMv8(); + default: + // fp128 will emit library calls. + return true; + } + + llvm_unreachable("covered type switch"); +} + // The "default" for integer RMW operations is to expand to an LL/SC loop. // However, with the LSE instructions (or outline-atomics mode, which provides // library routines in place of the LSE-instructions), we can directly emit many // operations instead. -// -// Floating-point operations are always emitted to a cmpxchg loop, because they -// may trigger a trap which aborts an LLSC sequence. 
TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { - unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + Type *Ty = AI->getType(); + unsigned Size = Ty->getPrimitiveSizeInBits(); assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes."); - if (AI->isFloatingPointOperation()) - return AtomicExpansionKind::CmpXChg; - bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && (AI->getOperation() == AtomicRMWInst::Xchg || AI->getOperation() == AtomicRMWInst::Or || @@ -27120,7 +27136,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // Nand is not supported in LSE. // Leave 128 bits to LLSC or CmpXChg. - if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { + if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 && + !AI->isFloatingPointOperation()) { if (Subtarget->hasLSE()) return AtomicExpansionKind::None; if (Subtarget->outlineAtomics()) { @@ -27146,7 +27163,7 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if // we have a single CAS instruction that can replace the loop. 
if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None || - Subtarget->hasLSE()) + Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI)) return AtomicExpansionKind::CmpXChg; return AtomicExpansionKind::LLSC; @@ -27193,10 +27210,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); + + auto *Int128Ty = Type::getInt128Ty(Builder.getContext()); + Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64"); + Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64"); + + Value *Or = Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64"); + return Builder.CreateBitCast(Or, ValueTy); } Type *Tys[] = { Addr->getType() }; @@ -27207,8 +27228,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); CallInst *CI = Builder.CreateCall(Ldxr, Addr); - CI->addParamAttr( - 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); + CI->addParamAttr(0, Attribute::get(Builder.getContext(), + Attribute::ElementType, IntEltTy)); Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); @@ -27234,9 +27255,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, IsRelease ? 
Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Type *Int128Ty = Type::getInt128Ty(M->getContext()); + + Value *CastVal = Builder.CreateBitCast(Val, Int128Ty); - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo"); + Value *Hi = + Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi"); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 89c9880ffc7868..0d230bb9dcc6e9 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -7,33 +7,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: 
stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -108,33 +92,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -211,19 +179,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr 
h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -232,21 +193,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -325,19 +274,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; 
NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -346,21 +288,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -436,31 +366,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: 
Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fadd s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -523,31 +437,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: 
.LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -748,35 +646,19 @@ define fp128 @test_atomicrmw_fadd_fp128_seq_cst_align16(ptr %ptr, fp128 %value) define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fadd v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: 
test_atomicrmw_fadd_v2f16_seq_cst_align4: @@ -867,38 +749,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fadd v3.4s, v3.4s, v0.4s +; NOLSE-NEXT: ushr v4.4s, v3.4s, #16 +; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b +; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s +; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret 
; ; LSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: @@ -984,31 +850,15 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1086,43 +936,17 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fadd_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: 
ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fadd v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 998d8ae0c1de4d..12a0c1169f2b6a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -10,33 +10,17 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // 
%atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -111,33 +95,17 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; 
NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -214,19 +182,12 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fmaxnm s2, s2, s1 @@ -235,21 +196,9 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; 
NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -328,19 +277,12 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fmaxnm s2, s2, s1 @@ -349,21 +291,9 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 
-; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -439,31 +369,15 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fmaxnm s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -526,31 +440,15 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // 
%atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -753,41 +651,25 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 ; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: b .LBB7_2 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fcvt s4, h0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s2, w8 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fcvt s2, h2 ; NOLSE-NEXT: fcvt 
s3, h3 -; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmaxnm s2, s2, s0 ; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fcvt h3, s3 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: mov v2.h[1], v3.h[0] +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: @@ -888,58 +770,42 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fmov w10, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr s0, [x0] ; NOLSE-NEXT: lsl w10, w10, #16 ; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: mov h3, v2.h[1] +; 
NOLSE-NEXT: fmov w11, s2 +; NOLSE-NEXT: lsl w11, w11, #16 +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s3 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmaxnm s4, s4, s2 -; NOLSE-NEXT: fmov s3, w9 ; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmaxnm s2, s2, s0 +; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: ubfx w13, w11, #16, #1 +; NOLSE-NEXT: add w11, w11, w8 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: add w11, w13, w11 +; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 ; NOLSE-NEXT: add w10, w10, w8 -; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: add w10, w12, w10 ; NOLSE-NEXT: lsr w10, w10, #16 -; NOLSE-NEXT: ubfx w11, w9, #16, #1 -; NOLSE-NEXT: add w9, w9, w8 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: add w9, w11, w9 -; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s3, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w10, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w11, [x0] -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: mov v3.h[1], v2.h[0] +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: stlxr w11, w10, [x0] +; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: @@ -1047,31 +913,15 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x 
float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1149,43 +999,17 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: 
Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 2697dbf5b2191d..71765f435d94cf 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -10,33 +10,17 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; 
NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -111,33 +95,17 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh 
w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -214,19 +182,12 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fminnm s2, s2, s1 @@ -235,21 +196,9 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: 
cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -328,19 +277,12 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fminnm s2, s2, s1 @@ -349,21 +291,9 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // 
kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -439,31 +369,15 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fminnm s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -526,31 +440,15 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // 
%atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -753,41 +651,25 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 ; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: b .LBB7_2 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fcvt s4, h0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s2, w8 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fcvt s2, h2 ; NOLSE-NEXT: fcvt s3, h3 -; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fminnm s2, s2, s0 ; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: 
fcvt h3, s3 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: mov v2.h[1], v3.h[0] +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: @@ -888,58 +770,42 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fmov w10, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr s0, [x0] ; NOLSE-NEXT: lsl w10, w10, #16 ; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fmov w11, s2 +; NOLSE-NEXT: lsl w11, w11, #16 +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s3 -; 
NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fminnm s4, s4, s2 -; NOLSE-NEXT: fmov s3, w9 ; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fminnm s2, s2, s0 +; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: ubfx w13, w11, #16, #1 +; NOLSE-NEXT: add w11, w11, w8 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: add w11, w13, w11 +; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 ; NOLSE-NEXT: add w10, w10, w8 -; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: add w10, w12, w10 ; NOLSE-NEXT: lsr w10, w10, #16 -; NOLSE-NEXT: ubfx w11, w9, #16, #1 -; NOLSE-NEXT: add w9, w9, w8 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: add w9, w11, w9 -; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s3, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w10, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w11, [x0] -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: mov v3.h[1], v2.h[0] +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: stlxr w11, w10, [x0] +; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: @@ -1047,31 +913,15 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: 
ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1149,43 +999,17 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: 
mov v1.d[1], x9 ; NOLSE-NEXT: fminnm v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index f41ddcb81d5ca5..67e164037d5ce7 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -7,33 +7,17 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; 
NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -108,33 +92,17 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh 
wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -211,19 +179,12 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fsub s2, s2, s1 @@ -232,21 +193,9 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // 
%atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -325,19 +274,12 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fsub s2, s2, s1 @@ -346,21 +288,9 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -436,31 +366,15 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float 
@test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fsub s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -523,31 +437,15 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr 
x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fsub d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -748,35 +646,19 @@ define fp128 @test_atomicrmw_fsub_fp128_seq_cst_align16(ptr %ptr, fp128 %value) define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fsub v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b 
.LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fsub v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: @@ -867,38 +749,22 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fsub v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fsub v3.4s, v3.4s, v0.4s +; NOLSE-NEXT: ushr v4.4s, v3.4s, #16 +; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b +; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s +; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: @@ -984,31 +850,15 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fsub v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: 
fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1086,43 +936,17 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fsub_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fsub v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret 
; From b5a1b45fe321cdf57d1b6155ecbbc18b6f95502f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 08:54:58 -0700 Subject: [PATCH 08/72] [SLP] Early return in getReorderingData [nfc] --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fe57dbfc93d3e7..81811e0a4d9295 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5226,6 +5226,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { !TE.isAltShuffle()) return TE.ReorderIndices; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { + if (!TE.ReorderIndices.empty()) + return TE.ReorderIndices; + auto PHICompare = [&](unsigned I1, unsigned I2) { Value *V1 = TE.Scalars[I1]; Value *V2 = TE.Scalars[I2]; @@ -5259,8 +5262,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return false; return true; }; - if (!TE.ReorderIndices.empty()) - return TE.ReorderIndices; DenseMap PhiToId; SmallVector Phis(TE.Scalars.size()); std::iota(Phis.begin(), Phis.end(), 0); From a9ffb719bc323588b6b60fbf227db8104a81310e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 09:07:51 -0700 Subject: [PATCH 09/72] [RISCV] Don't promote f16 FNEG/FABS with Zfhmin/Zhinxmin. (#106474) fneg/fabs are not supposed to canonicalize nans. Promoting to f32 will go through an fp_extend which will canonicalize. The generic Promote handler needs to be removed from LegalizeDAG. We need to use integer bit manip to clear the bit instead. Unfortunately, this is going through the stack due to i16 not being a legal type. Fixing that will require custom legalization or some other generic SelectionDAG change. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- llvm/test/CodeGen/RISCV/bfloat-arith.ll | 603 +++++++++++---- llvm/test/CodeGen/RISCV/half-arith-strict.ll | 631 ++++++++++----- llvm/test/CodeGen/RISCV/half-arith.ll | 724 ++++++++++-------- .../RISCV/half-bitmanip-dagcombines.ll | 70 +- llvm/test/CodeGen/RISCV/half-intrinsics.ll | 27 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 189 ++++- 7 files changed, 1533 insertions(+), 720 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dbb9241fe8cad2..09928dcc1f489a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -395,7 +395,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, - ISD::FABS, ISD::FNEG, ISD::STRICT_FMA, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, @@ -416,8 +415,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::bf16, Expand); setOperationAction(ZfhminZfbfminPromoteOps, MVT::bf16, Promote); setOperationAction(ISD::FREM, MVT::bf16, Promote); - // FIXME: Need to promote bf16 FCOPYSIGN to f32, but the - // DAGCombiner::visitFP_ROUND probably needs improvements first. + setOperationAction(ISD::FABS, MVT::bf16, Expand); + setOperationAction(ISD::FNEG, MVT::bf16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); } @@ -433,8 +432,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::STRICT_LRINT, ISD::STRICT_LLRINT, ISD::STRICT_LROUND, ISD::STRICT_LLROUND}, MVT::f16, Legal); - // FIXME: Need to promote f16 FCOPYSIGN to f32, but the - // DAGCombiner::visitFP_ROUND probably needs improvements first. 
+ setOperationAction(ISD::FABS, MVT::f16, Expand); + setOperationAction(ISD::FNEG, MVT::f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); } diff --git a/llvm/test/CodeGen/RISCV/bfloat-arith.ll b/llvm/test/CodeGen/RISCV/bfloat-arith.ll index 632e933c595671..56a30dd0f6ffee 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-arith.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-arith.ll @@ -105,17 +105,39 @@ define bfloat @fsgnj_s(bfloat %a, bfloat %b) nounwind { } define i32 @fneg_s(bfloat %a, bfloat %b) nounwind { -; CHECK-LABEL: fneg_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fadd.s fa5, fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: feq.s a0, fa5, fa4 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fneg_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: feq.s a0, fa5, fa4 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fneg_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: feq.s a0, 
fa5, fa4 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = fadd bfloat %a, %a %2 = fneg bfloat %1 %3 = fcmp oeq bfloat %1, %2 @@ -131,9 +153,11 @@ define bfloat @fsgnjn_s(bfloat %a, bfloat %b) nounwind { ; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 -; RV32IZFBFMIN-NEXT: fneg.s fa5, fa5 -; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 4(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 5(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 5(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 4(sp) ; RV32IZFBFMIN-NEXT: fsh fa0, 8(sp) ; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) ; RV32IZFBFMIN-NEXT: lbu a0, 9(sp) @@ -148,24 +172,26 @@ define bfloat @fsgnjn_s(bfloat %a, bfloat %b) nounwind { ; ; RV64IZFBFMIN-LABEL: fsgnjn_s: ; RV64IZFBFMIN: # %bb.0: -; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: addi sp, sp, -32 ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 -; RV64IZFBFMIN-NEXT: fneg.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fsh fa0, 0(sp) ; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) -; RV64IZFBFMIN-NEXT: lbu a0, 1(sp) -; RV64IZFBFMIN-NEXT: lbu a1, 9(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fsh fa0, 16(sp) +; RV64IZFBFMIN-NEXT: fsh fa5, 24(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 17(sp) +; RV64IZFBFMIN-NEXT: lbu a1, 25(sp) ; RV64IZFBFMIN-NEXT: andi a0, a0, 127 ; RV64IZFBFMIN-NEXT: andi a1, a1, 128 ; RV64IZFBFMIN-NEXT: or a0, a0, a1 -; RV64IZFBFMIN-NEXT: sb a0, 1(sp) -; RV64IZFBFMIN-NEXT: flh fa0, 0(sp) -; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: sb a0, 17(sp) +; RV64IZFBFMIN-NEXT: flh 
fa0, 16(sp) +; RV64IZFBFMIN-NEXT: addi sp, sp, 32 ; RV64IZFBFMIN-NEXT: ret %1 = fadd bfloat %a, %b %2 = fneg bfloat %1 @@ -176,19 +202,43 @@ define bfloat @fsgnjn_s(bfloat %a, bfloat %b) nounwind { declare bfloat @llvm.fabs.bf16(bfloat) define bfloat @fabs_s(bfloat %a, bfloat %b) nounwind { -; CHECK-LABEL: fabs_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 -; CHECK-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fabs.s fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fabs_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: andi a0, a0, 127 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fabs_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: andi a0, a0, 127 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; 
RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = fadd bfloat %a, %b %2 = call bfloat @llvm.fabs.bf16(bfloat %1) %3 = fadd bfloat %2, %1 @@ -239,21 +289,45 @@ define bfloat @fmadd_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fmsub_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fmsub_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fmsub_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 
9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %c_ = fadd bfloat 0.0, %c ; avoid negation using xor %negc = fsub bfloat -0.0, %c_ %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %negc) @@ -261,27 +335,61 @@ define bfloat @fmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 -; CHECK-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fneg.s fa4, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 9(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) 
+; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 1(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 1(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 0(sp) +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %a_ = fadd bfloat 0.0, %a %c_ = fadd bfloat 0.0, %c %nega = fsub bfloat -0.0, %a_ @@ -291,27 +399,61 @@ define bfloat @fnmadd_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s_2: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 -; CHECK-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; 
CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fneg.s fa4, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s_2: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 9(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s_2: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 1(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: 
sb a0, 1(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 0(sp) +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %b_ = fadd bfloat 0.0, %b %c_ = fadd bfloat 0.0, %c %negb = fsub bfloat -0.0, %b_ @@ -321,17 +463,37 @@ define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s_3(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s_3: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s_3: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa0, 12(sp) +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s_3: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: 
fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa0, 8(sp) +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %neg = fneg bfloat %1 ret bfloat %neg @@ -339,38 +501,82 @@ define bfloat @fnmadd_s_3(bfloat %a, bfloat %b, bfloat %c) nounwind { define bfloat @fnmadd_nsz(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_nsz: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_nsz: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa0, 12(sp) +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_nsz: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa0, 8(sp) +; RV64IZFBFMIN-NEXT: addi 
sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = call nsz bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %neg = fneg nsz bfloat %1 ret bfloat %neg } define bfloat @fnmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmsub_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmsub_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmsub_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; 
RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %a_ = fadd bfloat 0.0, %a %nega = fsub bfloat -0.0, %a_ %1 = call bfloat @llvm.fma.bf16(bfloat %nega, bfloat %b, bfloat %c) @@ -378,21 +584,45 @@ define bfloat @fnmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmsub_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmsub_s_2: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmsub_s_2: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmsub_s_2: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; 
RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %b_ = fadd bfloat 0.0, %b %negb = fsub bfloat -0.0, %b_ %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %negb, bfloat %c) @@ -439,30 +669,63 @@ define bfloat @fmsub_s_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s_contract: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa3, fa3 -; CHECK-NEXT: fcvt.s.bf16 fa2, fa2 -; CHECK-NEXT: fadd.s fa4, fa2, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa3 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fmul.s fa5, fa5, fa3 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fsub.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s_contract: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV32IZFBFMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa3, fa3 +; 
RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa3 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa3, 12(sp) +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa3 +; RV32IZFBFMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s_contract: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV64IZFBFMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa3, fa3 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa3 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa3, 8(sp) +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa3 +; RV64IZFBFMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %a_ = fadd bfloat 0.0, %a ; avoid negation using xor %b_ = fadd bfloat 0.0, %b ; avoid negation using xor %c_ = fadd bfloat 0.0, %c ; avoid negation 
using xor diff --git a/llvm/test/CodeGen/RISCV/half-arith-strict.ll b/llvm/test/CodeGen/RISCV/half-arith-strict.ll index 02cd91c7075940..4c7096f4045e2b 100644 --- a/llvm/test/CodeGen/RISCV/half-arith-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-arith-strict.ll @@ -11,16 +11,16 @@ ; RUN: | FileCheck -check-prefix=CHECK-ZHINX %s ; RUN: llc -mtriple=riscv32 -mattr=+zfhmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi ilp32f < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZFHMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZFHMIN,CHECK-ZFHMIN-RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+zfhmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi lp64f < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZFHMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZFHMIN,CHECK-ZFHMIN-RV64 %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi ilp32 < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZHINXMIN,CHECK-ZHINXMIN-RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi lp64 < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZHINXMIN,CHECK-ZHINXMIN-RV64 %s ; FIXME: We can't test without Zfh because soft promote legalization isn't ; implemented in SelectionDAG for STRICT nodes. 
@@ -239,36 +239,83 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fmsub.h a0, a0, a1, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fmsub_h: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fmsub_h: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fmsub_h: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-RV32-NEXT: 
fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fmsub_h: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fmsub_h: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: sh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fmsub_h: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: sh a2, 8(sp) 
+; CHECK-ZHINXMIN-RV64-NEXT: lbu a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %c_ = fadd half 0.0, %c ; avoid negation using xor %negc = fneg half %c_ %1 = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %negc, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp @@ -291,48 +338,115 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmadd.h a0, a0, a1, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmadd_h: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECK-ZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmadd_h: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; 
CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmadd_h: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa4, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmadd_h: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: 
lbu a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa4, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmadd_h: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: sh a0, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a0, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a0, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a0, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: sh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmadd_h: +; 
CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: sh a0, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a0, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a0, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a0, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: sh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %a_ = fadd half 0.0, %a %c_ = fadd half 0.0, %c %nega = fneg half %a_ @@ -357,48 +471,115 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmadd.h a0, a1, a0, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmadd_h_2: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECK-ZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; 
CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmadd_h_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmadd_h_2: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa4, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; 
CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmadd_h_2: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa4, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmadd_h_2: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: sh a1, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a1, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a1, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a1, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: sh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a2, 13(sp) +; 
CHECK-ZHINXMIN-RV32-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmadd_h_2: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: sh a1, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a1, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a1, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a1, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: sh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %b_ = fadd half 0.0, %b %c_ = fadd half 0.0, %c %negb = fneg half %b_ @@ -421,36 +602,83 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmsub.h a0, a0, a1, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmsub_h: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; 
CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmsub_h: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmsub_h: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmsub_h: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, 
fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmsub_h: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: sh a0, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a0, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmsub_h: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: sh a0, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a0, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: 
fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %a_ = fadd half 0.0, %a %nega = fneg half %a_ %1 = call half @llvm.experimental.constrained.fma.f16(half %nega, half %b, half %c, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp @@ -471,36 +699,83 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmsub.h a0, a1, a0, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmsub_h_2: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmsub_h_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmsub_h_2: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu 
a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmsub_h_2: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmsub_h_2: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: sh a1, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a1, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a1, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a1, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; 
CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmsub_h_2: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: sh a1, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a1, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a1, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a1, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %b_ = fadd half 0.0, %b %negb = fneg half %b_ %1 = call half @llvm.experimental.constrained.fma.f16(half %a, half %negb, half %c, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 10e63e3a9f7483..59981a282ab43e 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -630,29 +630,39 @@ define i32 @fneg_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fneg_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa4 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fneg_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, 
fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: feq.s a0, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fneg_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: feq.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fneg_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: feq.s a0, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fneg_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -771,9 +781,11 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 ; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 ; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fneg.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 4(sp) +; 
CHECK-RV32-FSGNJ-NEXT: lbu a0, 5(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 5(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 4(sp) ; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 8(sp) ; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) ; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) @@ -788,24 +800,26 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; ; CHECK-RV64-FSGNJ-LABEL: fsgnjn_s: ; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -32 ; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 ; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 ; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 ; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fneg.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 0(sp) ; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 16(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 24(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 17(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a1, 25(sp) ; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 127 ; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 128 ; CHECK-RV64-FSGNJ-NEXT: or a0, a0, a1 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 17(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa0, 16(sp) +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 32 ; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fsgnjn_s: ; CHECK-ZHINXMIN: # %bb.0: @@ -971,33 +985,43 @@ define half @fabs_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fabs_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECKIZFHMIN-NEXT: 
fcvt.s.h fa4, fa0 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fabs.s fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fabs_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 127 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fabs_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fabs.s a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a0, a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fabs_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: andi a0, 
a0, 127 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fabs_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 @@ -1409,36 +1433,45 @@ define half @fmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fmsub_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fmsub_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmsub_s: -; CHECKZHINXMIN: # 
%bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fneg.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fmsub_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fmsub_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 @@ -1591,48 +1624,61 @@ define half @fnmadd_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, 
fa4 -; CHECKIZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmadd_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fneg.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; 
CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmadd_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -1793,48 +1839,61 @@ define half @fnmadd_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s_2: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_2: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmadd_s_2: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fneg.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; 
CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fneg.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_2: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmadd_s_2: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 @@ -1959,17 +2018,37 @@ define half @fnmadd_s_3(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s_3: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_3: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret +; +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_3: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; ; CHECKZHINXMIN-LABEL: fnmadd_s_3: ; CHECKZHINXMIN: # %bb.0: @@ -2090,17 +2169,37 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_nsz: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; 
CHECK-RV32-FSGNJ-LABEL: fnmadd_nsz: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret +; +; CHECK-RV64-FSGNJ-LABEL: fnmadd_nsz: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; ; CHECKZHINXMIN-LABEL: fnmadd_nsz: ; CHECKZHINXMIN: # %bb.0: @@ -2227,36 +2326,45 @@ define half @fnmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmsub_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; 
CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmsub_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmsub_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmsub_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: 
fmadd.s fa5, fa5, fa3, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmsub_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -2379,36 +2487,45 @@ define half @fnmsub_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmsub_s_2: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmsub_s_2: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmsub_s_2: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; 
CHECKZHINXMIN-NEXT: fneg.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmsub_s_2: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmsub_s_2: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 @@ -2847,54 +2964,63 @@ define half @fnmadd_s_contract(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s_contract: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fadd.s fa3, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa3, fa3 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa2, fa2 -; CHECKIZFHMIN-NEXT: fadd.s fa4, fa2, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa3 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fmul.s fa5, fa5, fa3 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fsub.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_contract: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa3, fa3 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa3 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa3, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa3 +; CHECK-RV32-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmadd_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; 
CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_contract: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa3, fa3 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa3 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa3, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa3 +; CHECK-RV64-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmadd_s_contract: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 diff --git a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll index a103a9e09d1498..c824e7f9845951 100644 --- a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll +++ 
b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll @@ -208,13 +208,15 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV32IZFHMIN-LABEL: fcopysign_fneg: ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: addi sp, sp, -16 -; RV32IZFHMIN-NEXT: fmv.h.x fa5, a0 -; RV32IZFHMIN-NEXT: fmv.h.x fa4, a1 -; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; RV32IZFHMIN-NEXT: fneg.s fa4, fa4 -; RV32IZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) -; RV32IZFHMIN-NEXT: fsh fa4, 12(sp) +; RV32IZFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32IZFHMIN-NEXT: fsh fa5, 4(sp) +; RV32IZFHMIN-NEXT: lbu a1, 5(sp) +; RV32IZFHMIN-NEXT: xori a1, a1, 128 +; RV32IZFHMIN-NEXT: sb a1, 5(sp) +; RV32IZFHMIN-NEXT: flh fa5, 4(sp) +; RV32IZFHMIN-NEXT: fmv.h.x fa4, a0 +; RV32IZFHMIN-NEXT: fsh fa4, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) ; RV32IZFHMIN-NEXT: lbu a0, 9(sp) ; RV32IZFHMIN-NEXT: lbu a1, 13(sp) ; RV32IZFHMIN-NEXT: andi a0, a0, 127 @@ -228,31 +230,35 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; ; RV64IZFHMIN-LABEL: fcopysign_fneg: ; RV64IZFHMIN: # %bb.0: -; RV64IZFHMIN-NEXT: addi sp, sp, -16 -; RV64IZFHMIN-NEXT: fmv.h.x fa5, a0 -; RV64IZFHMIN-NEXT: fmv.h.x fa4, a1 -; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; RV64IZFHMIN-NEXT: fneg.s fa4, fa4 -; RV64IZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) -; RV64IZFHMIN-NEXT: fsh fa4, 8(sp) -; RV64IZFHMIN-NEXT: lbu a0, 1(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, -32 +; RV64IZFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) ; RV64IZFHMIN-NEXT: lbu a1, 9(sp) +; RV64IZFHMIN-NEXT: xori a1, a1, 128 +; RV64IZFHMIN-NEXT: sb a1, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fmv.h.x fa4, a0 +; RV64IZFHMIN-NEXT: fsh fa4, 16(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 24(sp) +; RV64IZFHMIN-NEXT: lbu a0, 17(sp) +; RV64IZFHMIN-NEXT: lbu a1, 25(sp) ; RV64IZFHMIN-NEXT: andi a0, a0, 127 ; RV64IZFHMIN-NEXT: andi a1, a1, 128 ; RV64IZFHMIN-NEXT: or a0, a0, a1 -; RV64IZFHMIN-NEXT: sb a0, 1(sp) -; 
RV64IZFHMIN-NEXT: flh fa5, 0(sp) +; RV64IZFHMIN-NEXT: sb a0, 17(sp) +; RV64IZFHMIN-NEXT: flh fa5, 16(sp) ; RV64IZFHMIN-NEXT: fmv.x.h a0, fa5 -; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: addi sp, sp, 32 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: fcopysign_fneg: ; RV32IZHINXMIN: # %bb.0: ; RV32IZHINXMIN-NEXT: addi sp, sp, -16 -; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV32IZHINXMIN-NEXT: fneg.s a1, a1 -; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 4(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 5(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 5(sp) +; RV32IZHINXMIN-NEXT: lh a1, 4(sp) ; RV32IZHINXMIN-NEXT: sh a0, 8(sp) ; RV32IZHINXMIN-NEXT: sh a1, 12(sp) ; RV32IZHINXMIN-NEXT: lbu a0, 9(sp) @@ -267,20 +273,22 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; ; RV64IZHINXMIN-LABEL: fcopysign_fneg: ; RV64IZHINXMIN: # %bb.0: -; RV64IZHINXMIN-NEXT: addi sp, sp, -16 -; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV64IZHINXMIN-NEXT: fneg.s a1, a1 -; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 -; RV64IZHINXMIN-NEXT: sh a0, 0(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, -32 ; RV64IZHINXMIN-NEXT: sh a1, 8(sp) -; RV64IZHINXMIN-NEXT: lbu a0, 1(sp) ; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: sh a0, 16(sp) +; RV64IZHINXMIN-NEXT: sh a1, 24(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 17(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 25(sp) ; RV64IZHINXMIN-NEXT: andi a0, a0, 127 ; RV64IZHINXMIN-NEXT: andi a1, a1, 128 ; RV64IZHINXMIN-NEXT: or a0, a0, a1 -; RV64IZHINXMIN-NEXT: sb a0, 1(sp) -; RV64IZHINXMIN-NEXT: lh a0, 0(sp) -; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: sb a0, 17(sp) +; RV64IZHINXMIN-NEXT: lh a0, 16(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, 32 ; RV64IZHINXMIN-NEXT: ret %1 = fneg half %b %2 = call half @llvm.copysign.f16(half %a, half %1) diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll 
b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index 4587c442cda5b3..7f1eebdf64a551 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -1821,12 +1821,27 @@ define half @fabs_f16(half %a) nounwind { ; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fabs_f16: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fabs.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fabs_f16: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fsh fa0, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 127 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa0, 12(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fabs_f16: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 127 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa0, 8(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: fabs_f16: ; RV32IZHINXMIN: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index cdbca0b874e607..fb9c0a57fd1bee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -650,38 +650,165 @@ define void @fabs_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fabs_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: 
vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: fabs_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vfabs.v v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: fabs_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: fabs_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, 
ma +; ZVFHMIN-ZFH-RV64-NEXT: vfabs.v v8, v9 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: fabs_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -64 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 64 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 36(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 37(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 37(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 33(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 33(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 29(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 29(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 25(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 25(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 21(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 21(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 17(sp) 
+; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 17(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 36(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 58(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 56(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 54(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 52(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 50(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa2, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 48(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 48 +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 46(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 44(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 42(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 40(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 40 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v8, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 64 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: fabs_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -80 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 56(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 48(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 40(sp) +; 
ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 57(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 57(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 49(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 49(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 41(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 41(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 33(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 33(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 25(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 25(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 17(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 17(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 56(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 74(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 48(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 72(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 40(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 70(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 68(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 66(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 64(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 64 +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v8, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 80 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x 
half>, ptr %x %b = call <6 x half> @llvm.fabs.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x From e05c22484efb5c767115525adfa4273e48b1ae26 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 29 Aug 2024 20:12:14 +0400 Subject: [PATCH 10/72] AArch64: Delete tests of fp128 atomicrmw fmin/fmax These are getting different output on some build hosts for some reason. The stack offsets of temporaries are different. --- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 285 +++++--------------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 285 +++++--------------- 2 files changed, 146 insertions(+), 424 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 12a0c1169f2b6a..bfe0d20ca814bc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -3,8 +3,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s -; FIXME: Windows hosts assigns stack slots to different offsets for some reason. 
-; UNSUPPORTED: system-windows +; FIXME: Restore test of fp128 case define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: @@ -508,144 +507,6 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ret double %res } -define fp128 @test_atomicrmw_fmax_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { -; NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: -; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 -; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; NOLSE-NEXT: bl fmaxl -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; 
NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: -; LSE: // %bb.0: -; LSE-NEXT: sub sp, sp, #96 -; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; LSE-NEXT: mov x19, x0 -; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; LSE-NEXT: .LBB6_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; LSE-NEXT: bl fmaxl -; LSE-NEXT: str q0, [sp, #48] -; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; LSE-NEXT: ldp x0, x1, [sp, #48] -; LSE-NEXT: str q0, [sp, #64] -; LSE-NEXT: ldp x2, x3, [sp, #64] -; LSE-NEXT: mov x4, x2 -; LSE-NEXT: mov x5, x3 -; LSE-NEXT: caspal x4, x5, x0, x1, [x19] -; LSE-NEXT: stp x4, x5, [sp, #32] -; LSE-NEXT: cmp x5, x3 -; LSE-NEXT: ldr q1, [sp, #32] -; LSE-NEXT: ccmp x4, x2, #0, eq -; LSE-NEXT: b.ne .LBB6_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: add sp, sp, #96 -; LSE-NEXT: ret -; -; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: -; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: mov x19, x3 -; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] -; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x21, x2 -; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 -; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x2, x21 -; SOFTFP-NOLSE-NEXT: mov x3, x19 -; SOFTFP-NOLSE-NEXT: mov x22, x1 -; SOFTFP-NOLSE-NEXT: mov x23, x0 -; SOFTFP-NOLSE-NEXT: bl fmaxl -; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: mov x9, x1 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cmp x0, x23 -; SOFTFP-NOLSE-NEXT: cset w10, ne -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne -; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp 
x30, x23, [sp], #48 // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ret - %res = atomicrmw fmax ptr %ptr, fp128 %value seq_cst, align 16 - ret fp128 %res -} - define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: @@ -653,7 +514,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w8, [x0] ; NOLSE-NEXT: fmov s2, w8 @@ -667,7 +528,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov v2.h[1], v3.h[0] ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: stlxr w10, w9, [x0] -; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: cbnz w10, .LBB6_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret @@ -679,7 +540,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: fcvt s2, h0 ; LSE-NEXT: ldr s0, [x0] ; LSE-NEXT: fcvt s1, h1 -; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: .LBB6_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fcvt s4, h0 @@ -695,7 +556,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: casal w10, w9, [x0] ; LSE-NEXT: fmov s0, w10 ; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: b.ne .LBB6_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -711,16 +572,16 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; 
SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 ; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -740,18 +601,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] ; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, 
#48] // 16-byte Folded Reload @@ -775,7 +636,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 @@ -803,7 +664,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov v3.h[1], v2.h[0] ; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: stlxr w11, w10, [x0] -; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: cbnz w11, .LBB7_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret @@ -820,7 +681,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: fmov s2, w10 ; LSE-NEXT: lsl w9, w9, #16 ; LSE-NEXT: fmov s1, w9 -; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fmov w10, s0 @@ -850,7 +711,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: casal w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 ; LSE-NEXT: cmp w11, w9 -; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: b.ne .LBB7_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -867,15 +728,15 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 ; 
SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -888,18 +749,18 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -913,14 +774,14 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: 
test_atomicrmw_fmax_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr x8, [x0] ; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s ; NOLSE-NEXT: fmov x8, d2 ; NOLSE-NEXT: stlxr w9, x8, [x0] -; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: cbnz w9, .LBB8_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret @@ -928,7 +789,7 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: .LBB8_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s ; LSE-NEXT: fmov x8, d1 @@ -937,7 +798,7 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-NEXT: casal x10, x9, [x0] ; LSE-NEXT: fmov d1, x10 ; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: b.ne .LBB8_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: fmov d0, d1 ; LSE-NEXT: ret @@ -952,16 +813,16 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 ; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 -; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; 
SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf @@ -973,18 +834,18 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: mov w9, w0 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] ; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -999,7 +860,7 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxp x8, x9, [x0] ; NOLSE-NEXT: fmov d1, x8 @@ -1008,7 +869,7 
@@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; NOLSE-NEXT: mov x8, v2.d[1] ; NOLSE-NEXT: fmov x9, d2 ; NOLSE-NEXT: stlxp w10, x9, x8, [x0] -; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: cbnz w10, .LBB9_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret @@ -1016,7 +877,7 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d ; LSE-NEXT: mov x3, v1.d[1] @@ -1030,7 +891,7 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-NEXT: cmp x7, x3 ; LSE-NEXT: ccmp x6, x2, #0, eq ; LSE-NEXT: mov v1.d[1], x7 -; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: b.ne .LBB9_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: mov v0.16b, v1.16b ; LSE-NEXT: ret @@ -1045,15 +906,15 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov x21, x2 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB10_2 -; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 -; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_6 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child 
Loop BB9_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x22, x1 ; SOFTFP-NOLSE-NEXT: mov x23, x0 ; SOFTFP-NOLSE-NEXT: mov x0, x1 @@ -1064,26 +925,26 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] ; SOFTFP-NOLSE-NEXT: cmp x0, x23 ; SOFTFP-NOLSE-NEXT: cset w9, ne ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB9_5 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll 
b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 71765f435d94cf..6b7d2df044460a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -3,8 +3,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s -; FIXME: Windows hosts assigns stack slots to different offsets for some reason. -; UNSUPPORTED: system-windows +; FIXME: Restore test of fp128 case define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: @@ -508,144 +507,6 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ret double %res } -define fp128 @test_atomicrmw_fmin_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { -; NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: -; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 -; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; NOLSE-NEXT: bl fminl -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] 
-; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: -; LSE: // %bb.0: -; LSE-NEXT: sub sp, sp, #96 -; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; LSE-NEXT: mov x19, x0 -; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; LSE-NEXT: .LBB6_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; LSE-NEXT: bl fminl -; LSE-NEXT: str q0, [sp, #48] -; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; LSE-NEXT: ldp x0, x1, [sp, #48] -; LSE-NEXT: str q0, [sp, #64] -; LSE-NEXT: ldp x2, x3, [sp, #64] -; LSE-NEXT: mov x4, x2 -; LSE-NEXT: mov x5, x3 -; LSE-NEXT: caspal x4, x5, x0, x1, [x19] -; LSE-NEXT: stp x4, x5, [sp, #32] -; LSE-NEXT: cmp x5, x3 -; LSE-NEXT: ldr q1, [sp, #32] -; LSE-NEXT: ccmp x4, x2, #0, eq -; LSE-NEXT: b.ne .LBB6_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: add 
sp, sp, #96 -; LSE-NEXT: ret -; -; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: -; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: mov x19, x3 -; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] -; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x21, x2 -; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 -; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x2, x21 -; SOFTFP-NOLSE-NEXT: mov x3, x19 -; SOFTFP-NOLSE-NEXT: mov x22, x1 -; SOFTFP-NOLSE-NEXT: mov x23, x0 -; SOFTFP-NOLSE-NEXT: bl fminl -; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: mov x9, x1 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cmp x0, x23 -; SOFTFP-NOLSE-NEXT: cset w10, ne -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne -; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // 
%atomicrmw.end -; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ret - %res = atomicrmw fmin ptr %ptr, fp128 %value seq_cst, align 16 - ret fp128 %res -} - define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: @@ -653,7 +514,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w8, [x0] ; NOLSE-NEXT: fmov s2, w8 @@ -667,7 +528,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov v2.h[1], v3.h[0] ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: stlxr w10, w9, [x0] -; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: cbnz w10, .LBB6_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret @@ -679,7 +540,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: fcvt s2, h0 ; LSE-NEXT: ldr s0, [x0] ; LSE-NEXT: fcvt s1, h1 -; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: .LBB6_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fcvt s4, h0 @@ -695,7 +556,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: casal w10, w9, [x0] ; LSE-NEXT: fmov s0, w10 ; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: b.ne .LBB6_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -711,16 +572,16 @@ define <2 x half> 
@test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 ; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -740,18 +601,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] ; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz 
wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -775,7 +636,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 @@ -803,7 +664,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov v3.h[1], v2.h[0] ; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: stlxr w11, w10, [x0] -; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: cbnz w11, .LBB7_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret @@ -820,7 +681,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: fmov s2, w10 ; LSE-NEXT: lsl w9, w9, #16 ; LSE-NEXT: fmov s1, w9 -; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fmov w10, s0 @@ -850,7 +711,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: casal w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 ; LSE-NEXT: cmp w11, w9 -; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: b.ne .LBB7_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -867,15 +728,15 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -888,18 +749,18 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -913,14 +774,14 @@ define <2 x bfloat> 
@test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr x8, [x0] ; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s ; NOLSE-NEXT: fmov x8, d2 ; NOLSE-NEXT: stlxr w9, x8, [x0] -; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: cbnz w9, .LBB8_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret @@ -928,7 +789,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: .LBB8_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fminnm v2.2s, v1.2s, v0.2s ; LSE-NEXT: fmov x8, d1 @@ -937,7 +798,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-NEXT: casal x10, x9, [x0] ; LSE-NEXT: fmov d1, x10 ; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: b.ne .LBB8_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: fmov d0, d1 ; LSE-NEXT: ret @@ -952,16 +813,16 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 ; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: // 
kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 -; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf @@ -973,18 +834,18 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: mov w9, w0 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] ; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -999,7 +860,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start 
+; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxp x8, x9, [x0] ; NOLSE-NEXT: fmov d1, x8 @@ -1008,7 +869,7 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; NOLSE-NEXT: mov x8, v2.d[1] ; NOLSE-NEXT: fmov x9, d2 ; NOLSE-NEXT: stlxp w10, x9, x8, [x0] -; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: cbnz w10, .LBB9_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret @@ -1016,7 +877,7 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fminnm v2.2d, v1.2d, v0.2d ; LSE-NEXT: mov x3, v1.d[1] @@ -1030,7 +891,7 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-NEXT: cmp x7, x3 ; LSE-NEXT: ccmp x6, x2, #0, eq ; LSE-NEXT: mov v1.d[1], x7 -; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: b.ne .LBB9_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: mov v0.16b, v1.16b ; LSE-NEXT: ret @@ -1045,15 +906,15 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov x21, x2 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB10_2 -; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 -; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_6 +; 
SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x22, x1 ; SOFTFP-NOLSE-NEXT: mov x23, x0 ; SOFTFP-NOLSE-NEXT: mov x0, x1 @@ -1064,26 +925,26 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] ; SOFTFP-NOLSE-NEXT: cmp x0, x23 ; SOFTFP-NOLSE-NEXT: cset w9, ne ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB9_5 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte 
Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From c08c6a71cfc536e22fb7ad733fb8181a9e84e62a Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Thu, 29 Aug 2024 09:20:59 -0700 Subject: [PATCH 11/72] [mlir][scf] Allow unrolling loops with integer-typed IV. (#106164) SCF loops now can operate on integer-typed IV, thus I'm changing the loop unroller correspondingly. --- mlir/lib/Dialect/SCF/Utils/Utils.cpp | 35 +++++++++++++--------- mlir/test/Dialect/SCF/loop-unroll.mlir | 41 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 9545610f10be7c..a794a121d6267b 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -270,11 +270,13 @@ bool mlir::getInnermostParallelLoops(Operation *rootOp, static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor) { assert(divisor > 0 && "expected positive divisor"); - assert(dividend.getType().isIndex() && "expected index-typed value"); + assert(dividend.getType().isIntOrIndex() && + "expected integer or index-typed value"); - Value divisorMinusOneCst = - builder.create(loc, divisor - 1); - Value divisorCst = builder.create(loc, divisor); + Value divisorMinusOneCst = builder.create( + loc, builder.getIntegerAttr(dividend.getType(), divisor - 1)); + Value divisorCst = builder.create( + loc, builder.getIntegerAttr(dividend.getType(), divisor)); Value sum = builder.create(loc, dividend, divisorMinusOneCst); return builder.create(loc, sum, divisorCst); } @@ -285,9 +287,10 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, // where divis is rounding-to-zero division. 
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, Value divisor) { - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value cstOne = builder.create(loc, 1); + assert(dividend.getType().isIntOrIndex() && + "expected integer or index-typed value"); + Value cstOne = builder.create( + loc, builder.getOneAttr(dividend.getType())); Value divisorMinusOne = builder.create(loc, divisor, cstOne); Value sum = builder.create(loc, dividend, divisorMinusOne); return builder.create(loc, sum, divisor); @@ -409,16 +412,18 @@ LogicalResult mlir::loopUnrollByFactor( // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. generateEpilogueLoop = upperBoundUnrolledCst < ubCst; if (generateEpilogueLoop) - upperBoundUnrolled = boundsBuilder.create( - loc, upperBoundUnrolledCst); + upperBoundUnrolled = boundsBuilder.create( + loc, boundsBuilder.getIntegerAttr(forOp.getUpperBound().getType(), + upperBoundUnrolledCst)); else upperBoundUnrolled = forOp.getUpperBound(); // Create constant for 'stepUnrolled'. stepUnrolled = stepCst == stepUnrolledCst ? step - : boundsBuilder.create( - loc, stepUnrolledCst); + : boundsBuilder.create( + loc, boundsBuilder.getIntegerAttr( + step.getType(), stepUnrolledCst)); } else { // Dynamic loop bounds computation. 
// TODO: Add dynamic asserts for negative lb/ub/step, or @@ -428,8 +433,8 @@ LogicalResult mlir::loopUnrollByFactor( Value diff = boundsBuilder.create(loc, upperBound, lowerBound); Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); - Value unrollFactorCst = - boundsBuilder.create(loc, unrollFactor); + Value unrollFactorCst = boundsBuilder.create( + loc, boundsBuilder.getIntegerAttr(tripCount.getType(), unrollFactor)); Value tripCountRem = boundsBuilder.create(loc, tripCount, unrollFactorCst); // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) @@ -476,7 +481,9 @@ LogicalResult mlir::loopUnrollByFactor( [&](unsigned i, Value iv, OpBuilder b) { // iv' = iv + step * i; auto stride = b.create( - loc, step, b.create(loc, i)); + loc, step, + b.create(loc, + b.getIntegerAttr(iv.getType(), i))); return b.create(loc, iv, stride); }, annotateFn, iterArgs, yieldedValues); diff --git a/mlir/test/Dialect/SCF/loop-unroll.mlir b/mlir/test/Dialect/SCF/loop-unroll.mlir index e28efbb6ec2b91..68a11fb6a72c64 100644 --- a/mlir/test/Dialect/SCF/loop-unroll.mlir +++ b/mlir/test/Dialect/SCF/loop-unroll.mlir @@ -448,3 +448,44 @@ func.func @loop_unroll_yield_iter_arg() { // CHECK-NEXT: affine.yield %[[ITER_ARG]] : index // CHECK-NEXT: } // CHECK-NEXT: return + +// ----- + +// Test the loop unroller works with integer IV type. 
+func.func @static_loop_unroll_with_integer_iv() -> (f32, f32) { + %0 = arith.constant 7.0 : f32 + %lb = arith.constant 0 : i32 + %ub = arith.constant 20 : i32 + %step = arith.constant 1 : i32 + %result:2 = scf.for %i0 = %lb to %ub step %step iter_args(%arg0 = %0, %arg1 = %0) -> (f32, f32) : i32{ + %add = arith.addf %arg0, %arg1 : f32 + %mul = arith.mulf %arg0, %arg1 : f32 + scf.yield %add, %mul : f32, f32 + } + return %result#0, %result#1 : f32, f32 +} +// UNROLL-BY-3-LABEL: func @static_loop_unroll_with_integer_iv +// +// UNROLL-BY-3-DAG: %[[CST:.*]] = arith.constant {{.*}} : f32 +// UNROLL-BY-3-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// UNROLL-BY-3-DAG: %[[C1:.*]] = arith.constant 1 : i32 +// UNROLL-BY-3-DAG: %[[C20:.*]] = arith.constant 20 : i32 +// UNROLL-BY-3-DAG: %[[C18:.*]] = arith.constant 18 : i32 +// UNROLL-BY-3-DAG: %[[C3:.*]] = arith.constant 3 : i32 +// UNROLL-BY-3: %[[FOR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C18]] step %[[C3]] +// UNROLL-BY-3-SAME: iter_args(%[[ARG0:.*]] = %[[CST]], %[[ARG1:.*]] = %[[CST]]) -> (f32, f32) : i32 { +// UNROLL-BY-3-NEXT: %[[ADD0:.*]] = arith.addf %[[ARG0]], %[[ARG1]] : f32 +// UNROLL-BY-3-NEXT: %[[MUL0:.*]] = arith.mulf %[[ARG0]], %[[ARG1]] : f32 +// UNROLL-BY-3-NEXT: %[[ADD1:.*]] = arith.addf %[[ADD0]], %[[MUL0]] : f32 +// UNROLL-BY-3-NEXT: %[[MUL1:.*]] = arith.mulf %[[ADD0]], %[[MUL0]] : f32 +// UNROLL-BY-3-NEXT: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[MUL1]] : f32 +// UNROLL-BY-3-NEXT: %[[MUL2:.*]] = arith.mulf %[[ADD1]], %[[MUL1]] : f32 +// UNROLL-BY-3-NEXT: scf.yield %[[ADD2]], %[[MUL2]] : f32, f32 +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3: %[[EFOR:.*]]:2 = scf.for %[[EIV:.*]] = %[[C18]] to %[[C20]] step %[[C1]] +// UNROLL-BY-3-SAME: iter_args(%[[EARG0:.*]] = %[[FOR]]#0, %[[EARG1:.*]] = %[[FOR]]#1) -> (f32, f32) : i32 { +// UNROLL-BY-3-NEXT: %[[EADD:.*]] = arith.addf %[[EARG0]], %[[EARG1]] : f32 +// UNROLL-BY-3-NEXT: %[[EMUL:.*]] = arith.mulf %[[EARG0]], %[[EARG1]] : f32 +// UNROLL-BY-3-NEXT: scf.yield 
%[[EADD]], %[[EMUL]] : f32, f32 +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: return %[[EFOR]]#0, %[[EFOR]]#1 : f32, f32 From 115b87636b9f84cf145c0c96859f8e9f5e76c7a1 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 09:28:45 -0700 Subject: [PATCH 12/72] [NFC][Support] Eliminate ',' at end of MemoryEffects print (#106545) - Eliminate comma at end of a MemoryEffects print. - Added basic unit test to validate that. --- llvm/lib/Support/ModRef.cpp | 7 ++++--- llvm/unittests/Support/CMakeLists.txt | 1 + llvm/unittests/Support/ModRefTest.cpp | 28 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 llvm/unittests/Support/ModRefTest.cpp diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index c5978296e97f0c..b57ea30b93832f 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ModRef.h" +#include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -33,7 +34,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, ModRefInfo MR) { } raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { - for (IRMemLocation Loc : MemoryEffects::locations()) { + interleaveComma(MemoryEffects::locations(), OS, [&](IRMemLocation Loc) { switch (Loc) { case IRMemLocation::ArgMem: OS << "ArgMem: "; @@ -45,7 +46,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { OS << "Other: "; break; } - OS << ME.getModRef(Loc) << ", "; - } + OS << ME.getModRef(Loc); + }); return OS; } diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index db47a170e814a6..d64f89847aa8e7 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -61,6 +61,7 @@ add_llvm_unittest(SupportTests MemoryBufferRefTest.cpp MemoryBufferTest.cpp MemoryTest.cpp + ModRefTest.cpp NativeFormatTests.cpp 
OptimizedStructLayoutTest.cpp ParallelTest.cpp diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp new file mode 100644 index 00000000000000..5ebb5f6a41a586 --- /dev/null +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -0,0 +1,28 @@ +//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ModRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; + +namespace { + +// Verify that printing a MemoryEffects does not end with a ,. +TEST(ModRefTest, PrintMemoryEffects) { + std::string S; + raw_string_ostream OS(S); + OS << MemoryEffects::none(); + OS.flush(); + EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); +} + +} // namespace From 81acc84997d6d5d5c7f8e3b3e6d8d01d567d0e1c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 29 Aug 2024 15:58:36 +0100 Subject: [PATCH 13/72] [LoopVectorize][X86] amdlibm-calls.ll - add 2/4/8/16 vector widths test checks for fallback to llvm intrinsics Check for cases where there isn't an amdlib call but it still vectorises the math call --- .../LoopVectorize/X86/amdlibm-calls.ll | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 1ec6a4febf18a8..04289d43f40e2f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -80,6 +80,7 @@ define void @sin_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double>
@amd_vrd2_sin(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_sin(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_sin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.sin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -102,6 +103,7 @@ for.end: define void @sin_f32(ptr nocapture %varray) { ; CHECK-LABEL: @sin_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_sinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_sinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_sinf(<16 x float> [[TMP4:%.*]]) @@ -130,6 +132,7 @@ define void @sin_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_sin(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_sin(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_sin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.sin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -152,6 +155,7 @@ for.end: define void @sin_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_sinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_sinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_sinf(<16 x float> [[TMP4:%.*]]) @@ -180,6 +184,7 @@ define void @cos_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cos(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cos(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> 
@amd_vrd8_cos(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cos.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -202,6 +207,7 @@ for.end: define void @cos_f32(ptr nocapture %varray) { ; CHECK-LABEL: @cos_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_cosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_cosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_cosf(<16 x float> [[TMP4:%.*]]) @@ -230,6 +236,7 @@ define void @cos_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cos(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cos(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_cos(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cos.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -252,6 +259,7 @@ for.end: define void @cos_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_cosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_cosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_cosf(<16 x float> [[TMP4:%.*]]) @@ -280,6 +288,7 @@ define void @tan_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.tan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -302,6 
+311,7 @@ for.end: define void @tan_f32(ptr nocapture %varray) { ; CHECK-LABEL: @tan_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanf(<16 x float> [[TMP4:%.*]]) @@ -330,6 +340,7 @@ define void @tan_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.tan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -352,6 +363,7 @@ for.end: define void @tan_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @tan_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanf(<16 x float> [[TMP4:%.*]]) @@ -770,6 +782,7 @@ define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-VF2: [[TMP8:%.*]] = call <2 x double> @amd_vrd2_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x double> @amd_vrd8_pow(<8 x double> [[TMP4:%.*]], <8 x double> [[WIDE_LOAD:%.*]]) +; CHECK-VF16: [[TMP8:%.*]] = call <16 x double> @llvm.pow.v16f64(<16 x double> [[TMP4:%.*]], <16 x double> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -797,6 +810,7 @@ 
define void @pow_f64_intrinsic(ptr nocapture %varray, ptr nocapture readonly %ex ; CHECK-VF2: [[TMP8:%.*]] = call <2 x double> @amd_vrd2_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x double> @amd_vrd8_pow(<8 x double> [[TMP4:%.*]], <8 x double> [[WIDE_LOAD:%.*]]) +; CHECK-VF16: [[TMP8:%.*]] = call <16 x double> @llvm.pow.v16f64(<16 x double> [[TMP4:%.*]], <16 x double> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -821,6 +835,7 @@ for.end: define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f32( +; CHECK-VF2: [[TMP8:%.*]] = call <2 x float> @llvm.pow.v2f32(<2 x float> [[TMP4:%.*]], <2 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x float> @amd_vrs4_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x float> @amd_vrs8_powf(<8 x float> [[TMP4:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF16: [[TMP8:%.*]] = call <16 x float> @amd_vrs16_powf(<16 x float> [[TMP4:%.*]], <16 x float> [[WIDE_LOAD:%.*]]) @@ -848,6 +863,7 @@ for.end: define void @pow_f32_intrinsic(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK-VF2: [[TMP8:%.*]] = call <2 x float> @llvm.pow.v2f32(<2 x float> [[TMP4:%.*]], <2 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x float> @amd_vrs4_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x float> @amd_vrs8_powf(<8 x float> [[TMP4:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF16: [[TMP8:%.*]] = call <16 x float> @amd_vrs16_powf(<16 x float> [[TMP4:%.*]], <16 x float> [[WIDE_LOAD:%.*]]) @@ -878,6 +894,7 @@ define void @exp_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] 
= call <4 x double> @amd_vrd4_exp(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -900,6 +917,7 @@ for.end: define void @exp_f32(ptr nocapture %varray) { ; CHECK-LABEL: @exp_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_expf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_expf(<16 x float> [[TMP4:%.*]]) @@ -928,6 +946,7 @@ define void @exp_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -950,6 +969,7 @@ for.end: define void @exp_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @exp_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_expf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_expf(<16 x float> [[TMP4:%.*]]) @@ -978,6 +998,7 @@ define void @log_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: 
[[TMP5:%.*]] = call <16 x double> @llvm.log.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1000,6 +1021,7 @@ for.end: define void @log_f32(ptr nocapture %varray) { ; CHECK-LABEL: @log_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_logf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_logf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_logf(<16 x float> [[TMP4:%.*]]) @@ -1028,6 +1050,7 @@ define void @log_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1050,6 +1073,7 @@ for.end: define void @log_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @log_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_logf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_logf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_logf(<16 x float> [[TMP4:%.*]]) @@ -1078,6 +1102,7 @@ define void @log2_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1100,6 +1125,7 @@ for.end: define void 
@log2_f32(ptr nocapture %varray) { ; CHECK-LABEL: @log2_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log2f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log2f(<16 x float> [[TMP4:%.*]]) @@ -1128,6 +1154,7 @@ define void @log2_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1150,6 +1177,7 @@ for.end: define void @log2_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @log2_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log2f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log2f(<16 x float> [[TMP4:%.*]]) @@ -1175,6 +1203,7 @@ for.end: define void @log10_f32(ptr nocapture %varray) { ; CHECK-LABEL: @log10_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log10.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log10f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log10f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log10f(<16 x float> [[TMP4:%.*]]) @@ -1200,6 +1229,7 @@ for.end: define void @log10_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @log10_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> 
@llvm.log10.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log10f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log10f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log10f(<16 x float> [[TMP4:%.*]]) @@ -1228,6 +1258,7 @@ define void @exp2_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1250,6 +1281,7 @@ for.end: define void @exp2_f32(ptr nocapture %varray) { ; CHECK-LABEL: @exp2_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp2f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp2f(<16 x float> [[TMP4:%.*]]) @@ -1278,6 +1310,7 @@ define void @exp2_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1300,6 +1333,7 @@ for.end: define void @exp2_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @exp2_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp2f(<4 x float> [[TMP4:%.*]]) ; 
CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp2f(<16 x float> [[TMP4:%.*]]) From a777a93118a0ca71e19ac764a57a94f1be227dbb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 29 Aug 2024 17:24:05 +0100 Subject: [PATCH 14/72] Fix MSVC "not all control paths return a value" warning. NFC. --- llvm/utils/TableGen/IntrinsicEmitter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 06b430ae011405..70ccecf7752af7 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -451,6 +451,7 @@ static AttributeSet getIntrinsicArgAttributeSet(LLVMContext &C, unsigned ID) { case CodeGenIntrinsic::Dereferenceable: return "Dereferenceable"; } + llvm_unreachable("Unknown CodeGenIntrinsic::ArgAttrKind enum"); }; OS << formatv(R"( From bd6531b9508624df83f84d9bc687a7339df452e9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 29 Aug 2024 09:45:19 -0700 Subject: [PATCH 15/72] [LTO] Introduce a new class ImportIDTable (#106503) The new class implements a deduplication table to convert import list elements: {SourceModule, GUID, Definition/Declaration} into 32-bit integers, and vice versa. This patch adds a unit test but does not add a use yet. To be precise, the deduplication table holds {SourceModule, GUID} pairs. We use the bottom one bit of the 32-bit integers to indicate whether we have a definition or declaration. A subsequent patch will collapse the import list hierarchy -- FunctionsToImportTy holding many instances of FunctionsToImportTy -- down to DenseSet with each element indexing into the deduplication table above. This will address multiple sources of space inefficiency. 
--- .../llvm/Transforms/IPO/FunctionImport.h | 71 +++++++++++++++ llvm/unittests/Transforms/IPO/CMakeLists.txt | 1 + .../Transforms/IPO/ImportIDTableTests.cpp | 91 +++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index b7280c56be9cc8..99fe110191dec9 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -10,6 +10,7 @@ #define LLVM_TRANSFORMS_IPO_FUNCTIONIMPORT_H #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/ModuleSummaryIndex.h" @@ -96,6 +97,76 @@ class FunctionImporter { std::tuple>>; + // Issues import IDs. Each ID uniquely corresponds to a tuple of + // (FromModule, GUID, Definition/Declaration). + // + // The import IDs make the import list space efficient by referring to each + // import with a 32-bit integer ID while maintaining a central table that maps + // those integer IDs to tuples of (FromModule, GUID, Def/Decl). + // + // In one large application, a pair of (FromModule, GUID) is mentioned in + // import lists more than 50 times on average across all destination modules. + // Mentioning the 32-byte tuple: + // + // std::tuple + // + // 50 times by value in various import lists would be costly. We can reduce + // the memory footprint of import lists by placing one copy in a central table + // and referring to it with 32-bit integer IDs. + // + // To save space within the central table, we only store pairs of + // (FromModule, GUID) in the central table. In the actual 32-bit integer ID, + // the top 31 bits index into the central table while the bottom 1 bit + // indicates whether an ID is for GlobalValueSummary::Declaration or + // GlobalValueSummary::Definition. 
+ class ImportIDTable { + public: + using ImportIDTy = uint32_t; + + // Create a pair of import IDs [Def, Decl] for a given pair of FromModule + // and GUID. + std::pair createImportIDs(StringRef FromModule, + GlobalValue::GUID GUID) { + auto Key = std::make_pair(FromModule, GUID); + auto InsertResult = TheTable.try_emplace(Key, TheTable.size()); + return makeIDPair(InsertResult.first->second); + } + + // Get a pair of previously created import IDs [Def, Decl] for a given pair + // of FromModule and GUID. Returns std::nullopt if not available. + std::optional> + getImportIDs(StringRef FromModule, GlobalValue::GUID GUID) { + auto Key = std::make_pair(FromModule, GUID); + auto It = TheTable.find(Key); + if (It != TheTable.end()) + return makeIDPair(It->second); + return std::nullopt; + } + + // Return a tuple of [FromModule, GUID, Def/Decl] that a given ImportID + // corresponds to. + std::tuple + lookup(ImportIDTy ImportID) const { + GlobalValueSummary::ImportKind Kind = + (ImportID & 1) ? GlobalValueSummary::Declaration + : GlobalValueSummary::Definition; + auto It = TheTable.begin() + (ImportID >> 1); + StringRef FromModule = It->first.first; + GlobalValue::GUID GUID = It->first.second; + return std::make_tuple(FromModule, GUID, Kind); + } + + private: + // Make a pair of import IDs [Def, Decl] from an index into TheTable. + static std::pair makeIDPair(ImportIDTy Index) { + ImportIDTy Def = Index << 1; + ImportIDTy Decl = Def | 1; + return std::make_pair(Def, Decl); + } + + MapVector, ImportIDTy> TheTable; + }; + /// The map maintains the list of imports. 
Conceptually, it is a collection /// of tuples of the form: /// diff --git a/llvm/unittests/Transforms/IPO/CMakeLists.txt b/llvm/unittests/Transforms/IPO/CMakeLists.txt index ac632450d57305..65be2966bec049 100644 --- a/llvm/unittests/Transforms/IPO/CMakeLists.txt +++ b/llvm/unittests/Transforms/IPO/CMakeLists.txt @@ -13,4 +13,5 @@ add_llvm_unittest(IPOTests WholeProgramDevirt.cpp AttributorTest.cpp FunctionSpecializationTest.cpp + ImportIDTableTests.cpp ) diff --git a/llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp b/llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp new file mode 100644 index 00000000000000..09e0a25ec9b93a --- /dev/null +++ b/llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp @@ -0,0 +1,91 @@ +//===- ImportIDTableTests.cpp - Unit tests for ImportIDTable --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/FunctionImport.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include +#include + +using namespace llvm; + +TEST(ImportIDTableTests, Basic) { + FunctionImporter::ImportIDTable Table; + + auto [Def, Decl] = Table.createImportIDs("mod", 123U); + auto [Def2, Decl2] = Table.createImportIDs("stuff", 456U); + + // Def and Decl must be of the same unsigned integer type. + static_assert( + std::is_unsigned_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // Check that all IDs are unique. + std::set IDs = {Def, Decl, Def2, + Decl2}; + EXPECT_THAT(IDs, ::testing::SizeIs(4)); + + // Verify what Def maps to. 
+ auto DefTuple = Table.lookup(Def); + EXPECT_EQ(std::get<0>(DefTuple), StringRef("mod")); + EXPECT_EQ(std::get<1>(DefTuple), 123U); + EXPECT_EQ(std::get<2>(DefTuple), GlobalValueSummary::Definition); + + // Verify what Decl maps to. + auto DeclTuple = Table.lookup(Decl); + EXPECT_EQ(std::get<0>(DeclTuple), StringRef("mod")); + EXPECT_EQ(std::get<1>(DeclTuple), 123U); + EXPECT_EQ(std::get<2>(DeclTuple), GlobalValueSummary::Declaration); + + // Verify what Def2 maps to. + auto Def2Tuple = Table.lookup(Def2); + EXPECT_EQ(std::get<0>(Def2Tuple), StringRef("stuff")); + EXPECT_EQ(std::get<1>(Def2Tuple), 456U); + EXPECT_EQ(std::get<2>(Def2Tuple), GlobalValueSummary::Definition); + + // Verify what Decl2 maps to. + auto Decl2Tuple = Table.lookup(Decl2); + EXPECT_EQ(std::get<0>(Decl2Tuple), StringRef("stuff")); + EXPECT_EQ(std::get<1>(Decl2Tuple), 456U); + EXPECT_EQ(std::get<2>(Decl2Tuple), GlobalValueSummary::Declaration); +} + +TEST(ImportIDTableTests, Duplicates) { + FunctionImporter::ImportIDTable Table; + + auto [Def1, Decl1] = Table.createImportIDs("mod", 123U); + auto [Def2, Decl2] = Table.createImportIDs("mod", 123U); + + // Verify we get the same IDs back. + EXPECT_EQ(Def1, Def2); + EXPECT_EQ(Decl1, Decl2); +} + +TEST(ImportIDTableTests, Present) { + FunctionImporter::ImportIDTable Table; + + auto [Def, Decl] = Table.createImportIDs("mod", 123U); + auto Result = Table.getImportIDs("mod", 123U); + + // Verify that we get the same IDs back. + ASSERT_NE(Result, std::nullopt); + EXPECT_EQ(Result->first, Def); + EXPECT_EQ(Result->second, Decl); +} + +TEST(ImportIDTableTests, Missing) { + FunctionImporter::ImportIDTable Table; + + auto Result = Table.getImportIDs("mod", 123U); + + // Verify that we get std::nullopt for a non-existent pair. 
+ EXPECT_EQ(Result, std::nullopt); +} From 59f05b683def5ef728baf8c4ae8f846e957ad67f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 09:45:47 -0700 Subject: [PATCH 16/72] [RISCV][TTI] Model cost for insert/extract into illegal types (#106440) We'd previously just deferred to the base implementation, but that more or less always returns 1. This underestimates the cost of the insert/extract, biases the SLP vectorizer towards forming illegally typed vectors, and underestimates the cost of scalarized operations (like unaligned scatter/gather). --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 3 - .../CostModel/RISCV/fixed-vector-gather.ll | 36 +-- .../CostModel/RISCV/fixed-vector-scatter.ll | 36 +-- .../CostModel/RISCV/rvv-extractelement.ll | 266 +++++++++--------- .../CostModel/RISCV/rvv-insertelement.ll | 250 ++++++++-------- 5 files changed, 294 insertions(+), 297 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 537c62bb0aacd1..bb8e162f57dfcd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1626,9 +1626,6 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (LT.second.isScalableVector() && !LT.first.isValid()) return LT.first; - if (!isTypeLegal(Val)) - return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); - // Mask vector extract/insert is expanded via e8. 
if (Val->getScalarSizeInBits() == 1) { VectorType *WideTy = diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll index ec7eb81d98bf91..f37cd99e803ec9 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll @@ -42,34 +42,34 @@ define i32 @masked_gather() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V1F64.u = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F16.u = call <16 x half> 
@llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16.u = call <1 x half> @llvm.masked.gather.v1f16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64.u = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32.u = call <2 x i32> 
@llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32.u = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i16> undef) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I16.u = call <1 x i16> @llvm.masked.gather.v1i16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll index 6da9d8d73cbd4c..ed15493c5e1e77 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll @@ -42,34 +42,34 @@ define i32 @masked_scatter() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i8.v1p0(<1 x i8> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 2, <2 x 
i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost 
of 31 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f16.v1p0(<1 x half> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, 
<4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i16.v1p0(<1 x i16> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll index c9145784afcc1c..4a9e30888cdd1f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll @@ -61,7 +61,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %nxv2i64_0 = extractelement undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_0 = extractelement undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_0 = extractelement undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = extractelement undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_0 = extractelement undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 @@ -85,7 +85,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 @@ -97,7 +97,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 @@ -107,7 +107,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_1 = extractelement <8 x i64> undef, i32 1 @@ -115,7 +115,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -139,7 +139,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x @@ -151,7 +151,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x @@ -161,7 +161,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x @@ -169,7 +169,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost 
of 5 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_int' @@ -250,7 +250,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 @@ -262,7 +262,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv32i16_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 @@ -272,7 +272,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_1 = extractelement <8 x i64> undef, i32 1 @@ -280,7 +280,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv8i64_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -304,7 +304,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x @@ -316,7 +316,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv32i16_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x @@ -326,7 +326,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x @@ -334,7 +334,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_int' @@ -387,11 +387,11 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_0 = extractelement <16 x i64> undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_0 = extractelement <16 x i64> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_0 = extractelement undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_0 = extractelement undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_0 = extractelement undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = extractelement undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_0 = extractelement undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 @@ -408,44 +408,44 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = extractelement <16 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = extractelement <32 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = extractelement <64 x i8> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = extractelement <16 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = extractelement <32 x i16> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = extractelement <16 x i32> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_1 = extractelement <8 x i64> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv16i64_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -462,44 +462,44 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_x = extractelement <16 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_x = extractelement <32 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_x = extractelement <64 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_x = extractelement <16 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_x = extractelement <32 x i16> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_x = extractelement <16 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'extractelement_int' @@ -573,44 +573,44 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = extractelement <16 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = extractelement <32 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = extractelement <64 x i8> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = extractelement <16 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = extractelement <32 x i16> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = extractelement <16 x i32> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_1 = 
extractelement <8 x i64> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -627,44 +627,44 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_x = extractelement <16 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_x = extractelement <32 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_x = extractelement <64 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv2i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_x = extractelement <16 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_x = extractelement <32 x i16> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %nxv4i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_x = extractelement <16 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv32i32_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i1_0 = extractelement <2 x i1> undef, i32 0 @@ -868,68 +868,68 @@ define void @extractelement_int_lmul(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV32V-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV32V-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_int_lmul' ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x +; 
RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_int_lmul' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x 
i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'extractelement_int_lmul' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v128i8_31 = extractelement <128 x i8> undef, i32 31 @@ -997,7 +997,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = extractelement <2 x float> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = extractelement <4 x float> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = extractelement <8 x float> undef, i32 1 @@ -1007,7 +1007,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v4f64_1 = extractelement <4 x double> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = extractelement <8 x double> undef, i32 1 @@ -1015,7 +1015,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_x = extractelement <2 x half> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_x = extractelement <4 x half> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_x = extractelement <8 x half> undef, i32 %x @@ -1027,7 +1027,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_x = extractelement <2 x float> undef, i32 %x ; RV32V-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4f32_x = extractelement <4 x float> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_x = extractelement <8 x float> undef, i32 %x @@ -1037,7 +1037,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_x = extractelement <2 x double> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_x = extractelement <4 x double> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_x = extractelement <8 x double> undef, i32 %x @@ -1045,7 +1045,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_fp' @@ 
-1090,7 +1090,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = extractelement <2 x float> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = extractelement <4 x float> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = extractelement <8 x float> undef, i32 1 @@ -1100,7 +1100,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_1 = extractelement <4 x double> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = extractelement <8 x double> undef, 
i32 1 @@ -1108,7 +1108,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_x = extractelement <2 x half> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_x = extractelement <4 x half> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_x = extractelement <8 x half> undef, i32 %x @@ -1120,7 +1120,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_x = extractelement <2 x float> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_x = extractelement <4 x float> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_x = extractelement <8 x 
float> undef, i32 %x @@ -1130,7 +1130,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_x = extractelement <2 x double> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_x = extractelement <4 x double> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_x = extractelement <8 x double> undef, i32 %x @@ -1138,7 +1138,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_fp' @@ -1441,50 +1441,50 @@ define void @extractelement_fp(i32 %x) { define void @extractelement_int_nonpoweroftwo(i32 %x) { ; RV32V-LABEL: 'extractelement_int_nonpoweroftwo' -; RV32V-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = 
extractelement <31 x i32> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_int_nonpoweroftwo' -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV64V-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = extractelement <31 x i32> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_int_nonpoweroftwo' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 
%x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = extractelement <31 x i32> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'extractelement_int_nonpoweroftwo' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = 
extractelement <15 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = extractelement <31 x i32> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll index c167adf1f33977..0616e0919b9d9a 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll @@ -61,7 +61,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_0 = insertelement undef, i64 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_0 = insertelement undef, i64 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i64_0 = insertelement undef, i64 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1 ; 
RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 @@ -85,7 +85,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 @@ -97,7 +97,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 
; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 @@ -107,7 +107,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 @@ -115,7 +115,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement 
undef, i64 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -139,7 +139,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x @@ -151,7 +151,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x @@ -161,7 +161,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x @@ -169,7 +169,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found 
an estimated cost of 5 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_int' @@ -250,7 +250,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 @@ -262,7 +262,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 @@ -272,7 +272,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 @@ -280,7 +280,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; 
RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -304,7 +304,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x @@ -316,7 +316,7 @@ define void 
@insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x @@ -326,7 +326,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 
%x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x @@ -334,7 +334,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_int' @@ -387,11 +387,11 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_0 = insertelement <16 x i64> undef, i64 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_0 = insertelement <16 x i64> undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_0 = insertelement undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_0 = insertelement undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %nxv8i64_0 = insertelement undef, i64 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 @@ -408,44 +408,44 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = insertelement <16 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = insertelement <32 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = insertelement <64 x i8> undef, i8 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; 
RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = insertelement <16 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = insertelement <32 x i16> undef, i16 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; 
RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = insertelement <16 x i32> undef, i32 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = insertelement undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; 
RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -462,44 +462,44 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8_x = insertelement <16 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v32i8_x = insertelement <32 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_x = insertelement <64 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = insertelement <128 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_x = insertelement <128 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v16i16_x = insertelement <16 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_x = insertelement <32 x i16> undef, i16 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = insertelement <64 x i16> undef, i16 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_x = insertelement <64 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_x = insertelement <16 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 5 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'insertelement_int' @@ -573,44 +573,44 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = insertelement <16 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = insertelement <32 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = insertelement <64 x i8> undef, i8 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV64ZVE64X-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = insertelement <16 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = insertelement <32 x i16> undef, i16 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = insertelement <16 x i32> undef, i32 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -627,44 +627,44 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8_x = insertelement <16 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_x = insertelement <32 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_x = insertelement <64 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = 
insertelement <128 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_x = insertelement <128 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_x = insertelement <16 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_x = insertelement <32 x i16> undef, i16 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = 
insertelement <64 x i16> undef, i16 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_x = insertelement <64 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_x = insertelement <16 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %nxv2i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i1_0 = insertelement <2 x i1> undef, i1 undef, i32 0 @@ -868,68 +868,68 @@ define void @insertelement_int_lmul(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = 
insertelement <64 x i32> undef, i32 undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_int_lmul' ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 
= insertelement <32 x i32> undef, i32 undef, i32 3 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_int_lmul' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = insertelement 
<128 x i8> undef, i8 undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'insertelement_int_lmul' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 -; 
RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 @@ -997,7 +997,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = insertelement undef, half undef, i32 1 ; RV32V-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv16f16_1 = insertelement undef, half undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = insertelement undef, half undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = insertelement <4 x float> undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = insertelement <8 x float> undef, float undef, i32 1 @@ -1007,7 +1007,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = insertelement undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = insertelement undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = insertelement undef, float undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_1 = insertelement <4 x double> undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = insertelement <8 x double> undef, double undef, i32 1 @@ -1015,7 
+1015,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = insertelement undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = insertelement undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = insertelement undef, double undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_x = insertelement <2 x half> undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_x = insertelement <4 x half> undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_x = insertelement <8 x half> undef, half undef, i32 %x @@ -1027,7 +1027,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_x = insertelement undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_x = insertelement undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_x = insertelement undef, half undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_x = insertelement <2 x float> undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_x = 
insertelement <4 x float> undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_x = insertelement <8 x float> undef, float undef, i32 %x @@ -1037,7 +1037,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_x = insertelement undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_x = insertelement undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_x = insertelement undef, float undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_x = insertelement <2 x double> undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_x = insertelement <4 x double> undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_x = insertelement <8 x double> undef, double undef, i32 %x @@ -1045,7 +1045,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_x = insertelement undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_x = insertelement undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_x = insertelement undef, double undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 
%x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_fp' @@ -1090,7 +1090,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = insertelement undef, half undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_1 = insertelement undef, half undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = insertelement undef, half undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = insertelement <4 x float> undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = insertelement <8 x float> undef, float undef, i32 1 @@ -1100,7 +1100,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = insertelement undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = insertelement undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = insertelement undef, float undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = 
insertelement <2 x double> undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_1 = insertelement <4 x double> undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = insertelement <8 x double> undef, double undef, i32 1 @@ -1108,7 +1108,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = insertelement undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = insertelement undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = insertelement undef, double undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_x = insertelement <2 x half> undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_x = insertelement <4 x half> undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_x = insertelement <8 x half> undef, half undef, i32 %x @@ -1120,7 +1120,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_x = insertelement undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_x = insertelement undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_x = insertelement undef, half undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x +; 
RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_x = insertelement <2 x float> undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_x = insertelement <4 x float> undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_x = insertelement <8 x float> undef, float undef, i32 %x @@ -1130,7 +1130,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_x = insertelement undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_x = insertelement undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_x = insertelement undef, float undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_x = insertelement <2 x double> undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_x = insertelement <4 x double> undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_x = insertelement <8 x double> undef, double undef, i32 %x @@ -1138,7 +1138,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_x = insertelement undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_x = insertelement undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %nxv8f64_x = insertelement undef, double undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_fp' @@ -1441,43 +1441,43 @@ define void @insertelement_fp(i32 %x) { define void @insertelement_int_nonpoweroftwo(i32 %x) { ; RV32V-LABEL: 'insertelement_int_nonpoweroftwo' -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_int_nonpoweroftwo' -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x 
i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_int_nonpoweroftwo' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'insertelement_int_nonpoweroftwo' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void ; %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x From e5e38ddf1b8043324175868831da21e941c00aff Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Thu, 29 Aug 2024 09:50:44 -0700 Subject: [PATCH 17/72] [AArch64] Make apple-m4 armv8.7-a again (from armv9.2-a). (#106312) This is a partial revert of c66e1d6f3429. Even though that allowed us to declare v9.2-a support without picking up SVE2 in both the backend and the driver, the frontend itself still enabled SVE via the arch version's default extensions. Avoid that by reverting back to v8.7-a while we look into longer-term solutions. --- clang/test/CodeGen/aarch64-targetattr.c | 9 +++++++++ llvm/lib/Target/AArch64/AArch64Processors.td | 7 ++++++- llvm/unittests/TargetParser/TargetParserTest.cpp | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c index d6227be2ebef83..1bc78a6e1f8c0f 100644 --- a/clang/test/CodeGen/aarch64-targetattr.c +++ b/clang/test/CodeGen/aarch64-targetattr.c @@ -191,6 +191,14 @@ __attribute__((target("no-v9.3a"))) // void minusarch() {} +__attribute__((target("cpu=apple-m4"))) +// CHECK-LABEL: define {{[^@]+}}@applem4 +// CHECK-SAME: () #[[ATTR18:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +void applem4() {} + //. 
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } @@ -210,6 +218,7 @@ void minusarch() {} // CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "branch-target-enforcement" "guarded-control-stack" "no-trapping-math"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.3a" } +// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m4" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+sme,+sme-f64f64,+sme-i16i64,+sme2,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8.7a,+v8a,+wfxt" } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 84d8cae3a0a5d1..1d6c71cbbf0ec3 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -895,7 +895,12 @@ def ProcessorFeatures { FeatureLSE, FeaturePAuth, FeatureFPAC, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureBF16, FeatureDotProd, FeatureMatMulInt8, FeatureSSBS]; - list AppleM4 = [HasV9_2aOps, FeatureSHA2, FeatureFPARMv8, + // Technically apple-m4 is v9.2a, but we can't use that here. + // Historically, llvm defined v9.0a as requiring SVE, but it's optional + // according to the Arm ARM, and not supported by the core. We decoupled the + // two in the clang driver and in the backend subtarget features, but it's + // still an issue in the clang frontend. v8.7a is the next closest choice. + list AppleM4 = [HasV8_7aOps, FeatureSHA2, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, FeatureAES, FeatureBF16, diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 7d999b826252a2..13db80ab5c68ea 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1122,7 +1122,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("apple-a16", "armv8.6-a"), AArch64CPUTestParams("apple-m3", "armv8.6-a"), AArch64CPUTestParams("apple-a17", "armv8.6-a"), - AArch64CPUTestParams("apple-m4", "armv9.2-a"), + AArch64CPUTestParams("apple-m4", "armv8.7-a"), AArch64CPUTestParams("exynos-m3", "armv8-a"), AArch64CPUTestParams("exynos-m4", "armv8.2-a"), AArch64CPUTestParams("exynos-m5", "armv8.2-a"), From 3d08ade7bd32f0296e0ca3a13640cc95fa89229a Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 29 Aug 2024 17:53:32 +0100 Subject: [PATCH 18/72] 
[ExtendLifetimes] Implement llvm.fake.use to extend variable lifetimes (#86149) This patch is part of a set of patches that add an `-fextend-lifetimes` flag to clang, which extends the lifetimes of local variables and parameters for improved debuggability. In addition to that flag, the patch series adds a pragma to selectively disable `-fextend-lifetimes`, and an `-fextend-this-ptr` flag which functions as `-fextend-lifetimes` for this pointers only. All changes and tests in these patches were written by Wolfgang Pieb (@wolfy1961), while Stephen Tozer (@SLTozer) has handled review and merging. The extend lifetimes flag is intended to eventually be set on by `-Og`, as discussed in the RFC here: https://discourse.llvm.org/t/rfc-redefine-og-o1-and-add-a-new-level-of-og/72850 This patch implements a new intrinsic instruction in LLVM, `llvm.fake.use` in IR and `FAKE_USE` in MIR, that takes a single operand and has no effect other than "using" its operand, to ensure that its operand remains live until after the fake use. This patch does not emit fake uses anywhere; the next patch in this sequence causes them to be emitted from the clang frontend, such that for each variable (or this) a fake.use operand is inserted at the end of that variable's scope, using that variable's value. This patch covers everything post-frontend, which is largely just the basic plumbing for a new intrinsic/instruction, along with a few steps to preserve the fake uses through optimizations (such as moving them ahead of a tail call or translating them through SROA). 
Co-authored-by: Stephen Tozer --- llvm/docs/LangRef.rst | 36 ++++ llvm/include/llvm/Analysis/PtrUseVisitor.h | 6 + llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 + llvm/include/llvm/CodeGen/MachineInstr.h | 2 + llvm/include/llvm/CodeGen/Passes.h | 3 + llvm/include/llvm/CodeGen/SelectionDAGISel.h | 1 + llvm/include/llvm/IR/Intrinsics.td | 3 + llvm/include/llvm/InitializePasses.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/include/llvm/Support/TargetOpcodes.def | 3 + llvm/include/llvm/Target/Target.td | 10 ++ llvm/lib/CodeGen/Analysis.cpp | 3 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 19 ++ llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/CodeGen.cpp | 1 + llvm/lib/CodeGen/CodeGenPrepare.cpp | 44 ++++- .../CodeGen/DeadMachineInstructionElim.cpp | 3 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 8 + llvm/lib/CodeGen/GlobalISel/Utils.cpp | 3 + llvm/lib/CodeGen/MachineCSE.cpp | 3 +- llvm/lib/CodeGen/MachineScheduler.cpp | 3 +- llvm/lib/CodeGen/MachineSink.cpp | 2 +- llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp | 162 ++++++++++++++++++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 + .../SelectionDAG/LegalizeFloatTypes.cpp | 20 +++ .../SelectionDAG/LegalizeIntegerTypes.cpp | 19 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 7 + .../SelectionDAG/LegalizeTypesGeneric.cpp | 11 ++ .../SelectionDAG/LegalizeVectorTypes.cpp | 36 ++++ .../SelectionDAG/SelectionDAGBuilder.cpp | 33 ++++ .../SelectionDAG/SelectionDAGDumper.cpp | 2 + .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 64 +++++++ llvm/lib/CodeGen/TargetPassConfig.cpp | 1 + llvm/lib/IR/Instruction.cpp | 5 +- llvm/lib/IR/Verifier.cpp | 1 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 1 + llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 1 + .../WebAssembly/WebAssemblyTargetMachine.cpp | 1 + llvm/lib/Target/X86/X86FloatingPoint.cpp | 32 ++++ llvm/lib/Transforms/Scalar/SROA.cpp | 30 ++++ llvm/lib/Transforms/Utils/CloneFunction.cpp | 6 + llvm/lib/Transforms/Utils/Local.cpp | 3 + 
.../Utils/PromoteMemoryToRegister.cpp | 3 +- .../ScalarEvolution/flags-from-poison-dbg.ll | 2 +- llvm/test/CodeGen/AArch64/O0-pipeline.ll | 1 + llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + llvm/test/CodeGen/ARM/O3-pipeline.ll | 1 + llvm/test/CodeGen/LoongArch/O0-pipeline.ll | 1 + llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 1 + .../CodeGen/MIR/X86/fake-use-tailcall.mir | 99 +++++++++++ llvm/test/CodeGen/PowerPC/O0-pipeline.ll | 1 + llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/O0-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 + llvm/test/CodeGen/X86/O0-pipeline.ll | 1 + llvm/test/CodeGen/X86/fake-use-hpfloat.ll | 15 ++ llvm/test/CodeGen/X86/fake-use-ld.ll | 43 +++++ llvm/test/CodeGen/X86/fake-use-scheduler.mir | 123 +++++++++++++ .../CodeGen/X86/fake-use-simple-tail-call.ll | 24 +++ .../CodeGen/X86/fake-use-suppress-load.ll | 14 ++ llvm/test/CodeGen/X86/fake-use-tailcall.ll | 37 ++++ llvm/test/CodeGen/X86/fake-use-vector.ll | 39 +++++ llvm/test/CodeGen/X86/fake-use-vector2.ll | 27 +++ llvm/test/CodeGen/X86/fake-use-zero-length.ll | 30 ++++ llvm/test/CodeGen/X86/opt-pipeline.ll | 1 + .../DebugInfo/AArch64/fake-use-global-isel.ll | 98 +++++++++++ llvm/test/DebugInfo/Inputs/check-fake-use.py | 107 ++++++++++++ llvm/test/DebugInfo/X86/fake-use.ll | 96 +++++++++++ .../GlobalISelCombinerEmitter/match-table.td | 54 +++--- .../CodeGenPrepare/X86/fake-use-phi.ll | 50 ++++++ .../CodeGenPrepare/X86/fake-use-split-ret.ll | 37 ++++ .../test/Transforms/GVN/fake-use-constprop.ll | 60 +++++++ llvm/test/Transforms/SROA/fake-use-escape.ll | 21 +++ llvm/test/Transforms/SROA/fake-use-sroa.ll | 52 ++++++ .../gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 + 76 files changed, 1609 insertions(+), 38 deletions(-) create mode 100644 llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp create mode 100644 llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir create mode 100644 llvm/test/CodeGen/X86/fake-use-hpfloat.ll 
create mode 100644 llvm/test/CodeGen/X86/fake-use-ld.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-scheduler.mir create mode 100644 llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-suppress-load.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-tailcall.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-vector.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-vector2.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-zero-length.ll create mode 100644 llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll create mode 100644 llvm/test/DebugInfo/Inputs/check-fake-use.py create mode 100644 llvm/test/DebugInfo/X86/fake-use.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll create mode 100644 llvm/test/Transforms/GVN/fake-use-constprop.ll create mode 100644 llvm/test/Transforms/SROA/fake-use-escape.ll create mode 100644 llvm/test/Transforms/SROA/fake-use-sroa.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 8c696cb16e77f8..cf0a6f96fb012e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -29477,6 +29477,42 @@ execution, but is unknown at compile time. If the result value does not fit in the result type, then the result is a :ref:`poison value `. +.. _llvm_fake_use: + +'``llvm.fake.use``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.fake.use(...) + +Overview: +""""""""" + +The ``llvm.fake.use`` intrinsic is a no-op. It takes a single +value as an operand and is treated as a use of that operand, to force the +optimizer to preserve that value prior to the fake use. This is used for +extending the lifetimes of variables, where this intrinsic placed at the end of +a variable's scope helps prevent that variable from being optimized out. 
+ +Arguments: +"""""""""" + +The ``llvm.fake.use`` intrinsic takes one argument, which may be any +function-local SSA value. Note that the signature is variadic so that the +intrinsic can take any type of argument, but passing more than one argument will +result in an error. + +Semantics: +"""""""""" + +This intrinsic does nothing, but optimizers must consider it a use of its single +operand and should try to preserve the intrinsic and its position in the +function. + Stack Map Intrinsics -------------------- diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index b6cc14d2077af0..f5c23b1b4e014d 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -278,6 +278,12 @@ class PtrUseVisitor : protected InstVisitor, default: return Base::visitIntrinsicInst(II); + // We escape pointers used by a fake_use to prevent SROA from transforming + // them. + case Intrinsic::fake_use: + PI.setEscaped(&II); + return; + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: return; // No-op intrinsics. diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 86ff2628975942..187d624f0a73b9 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1372,6 +1372,11 @@ enum NodeType { LIFETIME_START, LIFETIME_END, + /// FAKE_USE represents a use of the operand but does not do anything. + /// Its purpose is the extension of the operand's lifetime mainly for + /// debugging purposes. + FAKE_USE, + /// GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the /// beginning and end of GC transition sequence, and carry arbitrary /// information that target might need for lowering. 
The first operand is diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 04c8144f2fe7af..62667cc8ef3800 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1435,6 +1435,8 @@ class MachineInstr return getOpcode() == TargetOpcode::EXTRACT_SUBREG; } + bool isFakeUse() const { return getOpcode() == TargetOpcode::FAKE_USE; } + /// Return true if the instruction behaves like a copy. /// This does not include native copy instructions. bool isCopyLike() const { diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index c7c2178571215b..dbdd110b0600e5 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -440,6 +440,9 @@ namespace llvm { // metadata after llvm SanitizerBinaryMetadata pass. extern char &MachineSanitizerBinaryMetadataID; + /// RemoveLoadsIntoFakeUses pass. + extern char &RemoveLoadsIntoFakeUsesID; + /// RemoveRedundantDebugValues pass. 
extern char &RemoveRedundantDebugValuesID; diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index fc0590b1a1b69e..f6191c6fdb7fe6 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -463,6 +463,7 @@ class SelectionDAGISel { void Select_READ_REGISTER(SDNode *Op); void Select_WRITE_REGISTER(SDNode *Op); void Select_UNDEF(SDNode *N); + void Select_FAKE_USE(SDNode *N); void CannotYetSelect(SDNode *N); void Select_FREEZE(SDNode *N); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index e3bf0446575ae5..232d6be1073f49 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1835,6 +1835,9 @@ def int_is_constant : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem, IntrWillReturn, IntrConvergent], "llvm.is.constant">; +// Introduce a use of the argument without generating any code. +def int_fake_use : Intrinsic<[], [llvm_vararg_ty]>; + // Intrinsic to mask out bits of a pointer. // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index cc5e93c58f564a..47a1ca15fc0d1f 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -264,6 +264,7 @@ void initializeRegionOnlyViewerPass(PassRegistry &); void initializeRegionPrinterPass(PassRegistry &); void initializeRegionViewerPass(PassRegistry &); void initializeRegisterCoalescerPass(PassRegistry &); +void initializeRemoveLoadsIntoFakeUsesPass(PassRegistry &); void initializeRemoveRedundantDebugValuesPass(PassRegistry &); void initializeRenameIndependentSubregsPass(PassRegistry &); void initializeReplaceWithVeclibLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 05baf514fa7210..b710b1c46f643f 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -250,6 +250,7 @@ DUMMY_MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass) DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass) DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass) +DUMMY_MACHINE_FUNCTION_PASS("remove-loads-into-fake-uses", RemoveLoadsIntoFakeUsesPass) DUMMY_MACHINE_FUNCTION_PASS("removeredundantdebugvalues", RemoveRedundantDebugValuesPass) DUMMY_MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass) DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass) diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 9fb6de49fb2055..635c265a433631 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -217,6 +217,9 @@ HANDLE_TARGET_OPCODE(PATCHABLE_TYPED_EVENT_CALL) HANDLE_TARGET_OPCODE(ICALL_BRANCH_FUNNEL) +/// Represents a use of the operand but 
generates no code. +HANDLE_TARGET_OPCODE(FAKE_USE) + // This is a fence with the singlethread scope. It represents a compiler memory // barrier, but does not correspond to any generated instruction. HANDLE_TARGET_OPCODE(MEMBARRIER) diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 34332386085870..b2eb250ae60b60 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1418,6 +1418,16 @@ def FAULTING_OP : StandardPseudoInstruction { let isTerminator = true; let isBranch = true; } +def FAKE_USE : StandardPseudoInstruction { + // An instruction that uses its operands but does nothing; this instruction + // will be treated specially by CodeGen passes, distinguishing it from any + // otherwise equivalent instructions. + let OutOperandList = (outs); + let InOperandList = (ins variable_ops); + let AsmString = "FAKE_USE"; + let hasSideEffects = 0; + let isMeta = true; +} def PATCHABLE_OP : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins variable_ops); diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 128060ec912c76..f77b733c6c8f69 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -567,7 +567,8 @@ bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM, if (const IntrinsicInst *II = dyn_cast(BBI)) if (II->getIntrinsicID() == Intrinsic::lifetime_end || II->getIntrinsicID() == Intrinsic::assume || - II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) + II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl || + II->getIntrinsicID() == Intrinsic::fake_use) continue; if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() || !isSafeToSpeculativelyExecute(&*BBI)) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 60cb26973ead41..19d23c8ba96783 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ 
b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1131,6 +1131,21 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { AP.OutStreamer->addBlankLine(); } +static void emitFakeUse(const MachineInstr *MI, AsmPrinter &AP) { + std::string Str; + raw_string_ostream OS(Str); + OS << "fake_use:"; + for (const MachineOperand &Op : MI->operands()) { + // In some circumstances we can end up with fake uses of constants; skip + // these. + if (!Op.isReg()) + continue; + OS << ' ' << printReg(Op.getReg(), AP.MF->getSubtarget().getRegisterInfo()); + } + AP.OutStreamer->AddComment(OS.str()); + AP.OutStreamer->addBlankLine(); +} + /// emitDebugValueComment - This method handles the target-independent form /// of DBG_VALUE, returning true if it was able to do so. A false return /// means the target will need to handle MI in EmitInstruction. @@ -1799,6 +1814,10 @@ void AsmPrinter::emitFunctionBody() { case TargetOpcode::KILL: if (isVerbose()) emitKill(&MI, *this); break; + case TargetOpcode::FAKE_USE: + if (isVerbose()) + emitFakeUse(&MI, *this); + break; case TargetOpcode::PSEUDO_PROBE: emitPseudoProbe(MI); break; diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index f1607f85c5b319..ae12ce1170f703 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -200,6 +200,7 @@ add_llvm_component_library(LLVMCodeGen RegisterUsageInfo.cpp RegUsageInfoCollector.cpp RegUsageInfoPropagate.cpp + RemoveLoadsIntoFakeUses.cpp ReplaceWithVeclib.cpp ResetMachineFunctionPass.cpp RegisterBank.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 31fa4c105cef80..177702054a0e31 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -116,6 +116,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRegUsageInfoCollectorPass(Registry); initializeRegUsageInfoPropagationPass(Registry); initializeRegisterCoalescerPass(Registry); + 
initializeRemoveLoadsIntoFakeUsesPass(Registry); initializeRemoveRedundantDebugValuesPass(Registry); initializeRenameIndependentSubregsPass(Registry); initializeSafeStackLegacyPassPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index da6c758d53d487..271a047fc6a7b8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2800,12 +2800,34 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, return false; }; + SmallVector FakeUses; + + auto isFakeUse = [&FakeUses](const Instruction *Inst) { + if (auto *II = dyn_cast(Inst); + II && II->getIntrinsicID() == Intrinsic::fake_use) { + // Record the instruction so it can be preserved when the exit block is + // removed. Do not preserve the fake use that uses the result of the + // PHI instruction. + // Do not copy fake uses that use the result of a PHI node. + // FIXME: If we do want to copy the fake use into the return blocks, we + // have to figure out which of the PHI node operands to use for each + // copy. + if (!isa(II->getOperand(0))) { + FakeUses.push_back(II); + } + return true; + } + + return false; + }; + // Make sure there are no instructions between the first instruction // and return. const Instruction *BI = BB->getFirstNonPHI(); // Skip over debug and the bitcast. while (isa(BI) || BI == BCI || BI == EVI || - isa(BI) || isLifetimeEndOrBitCastFor(BI)) + isa(BI) || isLifetimeEndOrBitCastFor(BI) || + isFakeUse(BI)) BI = BI->getNextNode(); if (BI != RetI) return false; @@ -2814,6 +2836,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, /// call. const Function *F = BB->getParent(); SmallVector TailCallBBs; + // Record the call instructions so we can insert any fake uses + // that need to be preserved before them. + SmallVector CallInsts; if (PN) { for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { // Look through bitcasts. 
@@ -2825,6 +2850,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, TLI->mayBeEmittedAsTailCall(CI) && attributesPermitTailCall(F, CI, RetI, *TLI)) { TailCallBBs.push_back(PredBB); + CallInsts.push_back(CI); } else { // Consider the cases in which the phi value is indirectly produced by // the tail call, for example when encountering memset(), memmove(), @@ -2844,8 +2870,10 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && IncomingVal == CI->getArgOperand(0) && TLI->mayBeEmittedAsTailCall(CI) && - attributesPermitTailCall(F, CI, RetI, *TLI)) + attributesPermitTailCall(F, CI, RetI, *TLI)) { TailCallBBs.push_back(PredBB); + CallInsts.push_back(CI); + } } } } else { @@ -2863,6 +2891,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && V == CI->getArgOperand(0))) { TailCallBBs.push_back(Pred); + CallInsts.push_back(CI); } } } @@ -2889,8 +2918,17 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, } // If we eliminated all predecessors of the block, delete the block now. - if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) + if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) { + // Copy the fake uses found in the original return block to all blocks + // that contain tail calls. + for (auto *CI : CallInsts) { + for (auto const *FakeUse : FakeUses) { + auto *ClonedInst = FakeUse->clone(); + ClonedInst->insertBefore(CI); + } + } BB->eraseFromParent(); + } return Changed; } diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 7fc25cd889a0df..332ed37bd2b79f 100644 --- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -87,7 +87,8 @@ bool DeadMachineInstructionElimImpl::isDead(const MachineInstr *MI) const { return false; // Don't delete frame allocation labels. 
- if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE) + if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE || + MI->getOpcode() == TargetOpcode::FAKE_USE) return false; // Don't delete instructions with side effects. diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index f44af78cded46d..968d0a2a5c75e4 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2193,6 +2193,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, } return true; } + case Intrinsic::fake_use: { + SmallVector VRegs; + for (const auto &Arg : CI.args()) + for (auto VReg : getOrCreateVRegs(*Arg)) + VRegs.push_back(VReg); + MIRBuilder.buildInstr(TargetOpcode::FAKE_USE, std::nullopt, VRegs); + return true; + } case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast(CI); assert(DI.getVariable() && "Missing variable"); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index cfdd9905c16fa6..b1270e7aeb875c 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -228,6 +228,9 @@ bool llvm::isTriviallyDead(const MachineInstr &MI, // Don't delete frame allocation labels. if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) return false; + // Don't delete fake uses. + if (MI.getOpcode() == TargetOpcode::FAKE_USE) + return false; // LIFETIME markers should be preserved even if they seem dead. 
if (MI.getOpcode() == TargetOpcode::LIFETIME_START || MI.getOpcode() == TargetOpcode::LIFETIME_END) diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 27bbf5599b6046..aadc54b495fe22 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -406,7 +406,8 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, bool MachineCSE::isCSECandidate(MachineInstr *MI) { if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || - MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo()) + MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo() || + MI->isFakeUse()) return false; // Ignore copies. diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 7a3cf96ccffe0a..4e6d34346b1d80 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -530,7 +530,8 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB, MachineFunction *MF, const TargetInstrInfo *TII) { - return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF); + return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF) || + MI->isFakeUse(); } /// A region of an MBB for scheduling. 
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index fe515ef5be541f..609f9af9767f5d 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -833,7 +833,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (!ProcessedBegin) --I; - if (MI.isDebugOrPseudoInstr()) { + if (MI.isDebugOrPseudoInstr() || MI.isFakeUse()) { if (MI.isDebugValue()) ProcessDbgInst(MI); continue; diff --git a/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp new file mode 100644 index 00000000000000..232181a199b8c2 --- /dev/null +++ b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp @@ -0,0 +1,162 @@ +//===---- RemoveLoadsIntoFakeUses.cpp - Remove loads with no real uses ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The FAKE_USE instruction is used to preserve certain values through +/// optimizations for the sake of debugging. This may result in spilled values +/// being loaded into registers that are only used by FAKE_USEs; this is not +/// necessary for debugging purposes, because at that point the value must be on +/// the stack and hence available for debugging. Therefore, this pass removes +/// loads that are only used by FAKE_USEs. +/// +/// This pass should run very late, to ensure that we don't inadvertently +/// shorten stack lifetimes by removing these loads, since the FAKE_USEs will +/// also no longer be in effect. Running immediately before LiveDebugValues +/// ensures that LDV will have accurate information of the machine location of +/// debug values. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "remove-loads-into-fake-uses" + +STATISTIC(NumLoadsDeleted, "Number of dead load instructions deleted"); +STATISTIC(NumFakeUsesDeleted, "Number of FAKE_USE instructions deleted"); + +class RemoveLoadsIntoFakeUses : public MachineFunctionPass { +public: + static char ID; + + RemoveLoadsIntoFakeUses() : MachineFunctionPass(ID) { + initializeRemoveLoadsIntoFakeUsesPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "Remove Loads Into Fake Uses"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +char RemoveLoadsIntoFakeUses::ID = 0; +char &llvm::RemoveLoadsIntoFakeUsesID = RemoveLoadsIntoFakeUses::ID; + +INITIALIZE_PASS_BEGIN(RemoveLoadsIntoFakeUses, DEBUG_TYPE, + "Remove Loads Into Fake Uses", false, false) +INITIALIZE_PASS_END(RemoveLoadsIntoFakeUses, DEBUG_TYPE, + "Remove Loads Into Fake Uses", false, false) + +bool RemoveLoadsIntoFakeUses::runOnMachineFunction(MachineFunction &MF) { + // Only `optdebug` functions should contain FAKE_USEs, so don't try to run + // this for other functions. 
+ if (!MF.getFunction().hasFnAttribute(Attribute::OptimizeForDebugging) || + skipFunction(MF.getFunction())) + return false; + + bool AnyChanges = false; + + LiveRegUnits LivePhysRegs; + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + + SmallDenseMap> RegFakeUses; + LivePhysRegs.init(*TRI); + SmallVector Statepoints; + for (MachineBasicBlock *MBB : post_order(&MF)) { + LivePhysRegs.addLiveOuts(*MBB); + + for (MachineInstr &MI : make_early_inc_range(reverse(*MBB))) { + if (MI.isFakeUse()) { + for (const MachineOperand &MO : MI.operands()) { + // Track the Fake Uses that use this register so that we can delete + // them if we delete the corresponding load. + if (MO.isReg()) + RegFakeUses[MO.getReg()].push_back(&MI); + } + // Do not record FAKE_USE uses in LivePhysRegs so that we can recognize + // otherwise-unused loads. + continue; + } + + // If the restore size is not std::nullopt then we are dealing with a + // reload of a spilled register. + if (MI.getRestoreSize(TII)) { + Register Reg = MI.getOperand(0).getReg(); + assert(Reg.isPhysical() && "VReg seen in function with NoVRegs set?"); + // Don't delete live physreg defs, or any reserved register defs. + if (!LivePhysRegs.available(Reg) || MRI->isReserved(Reg)) + continue; + // There should be an exact match between the loaded register and the + // FAKE_USE use. If not, this is a load that is unused by anything? It + // should probably be deleted, but that's outside of this pass' scope. + if (RegFakeUses.contains(Reg)) { + LLVM_DEBUG(dbgs() << "RemoveLoadsIntoFakeUses: DELETING: " << MI); + // It is possible that some DBG_VALUE instructions refer to this + // instruction. They will be deleted in the live debug variable + // analysis. 
+ MI.eraseFromParent(); + AnyChanges = true; + ++NumLoadsDeleted; + // Each FAKE_USE now appears to be a fake use of the previous value + // of the loaded register; delete them to avoid incorrectly + // interpreting them as such. + for (MachineInstr *FakeUse : RegFakeUses[Reg]) { + LLVM_DEBUG(dbgs() + << "RemoveLoadsIntoFakeUses: DELETING: " << *FakeUse); + FakeUse->eraseFromParent(); + } + NumFakeUsesDeleted += RegFakeUses[Reg].size(); + RegFakeUses[Reg].clear(); + } + continue; + } + + // In addition to tracking LivePhysRegs, we need to clear RegFakeUses each + // time a register is defined, as existing FAKE_USEs no longer apply to + // that register. + if (!RegFakeUses.empty()) { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isDef()) { + Register Reg = MO.getReg(); + assert(Reg.isPhysical() && + "VReg seen in function with NoVRegs set?"); + for (MCRegUnit Unit : TRI->regunits(Reg)) + RegFakeUses.erase(Unit); + } + } + } + LivePhysRegs.stepBackward(MI); + } + } + + return AnyChanges; +} diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 067f82c99adca1..162af2d9d708a9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1464,6 +1464,9 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { updateValueMap(II, ResultReg); return true; } + case Intrinsic::fake_use: + // At -O0, we don't need fake use, so just ignore it. 
+ return true; case Intrinsic::experimental_stackmap: return selectStackmap(II); case Intrinsic::experimental_patchpoint_void: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 221dcfe145594f..b5c80005a0ecc1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2438,6 +2438,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { report_fatal_error("Do not know how to promote this operator's operand!"); case ISD::BITCAST: R = PromoteFloatOp_BITCAST(N, OpNo); break; + case ISD::FAKE_USE: + R = PromoteFloatOp_FAKE_USE(N, OpNo); + break; case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: @@ -2480,6 +2483,13 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo) { return DAG.getBitcast(N->getValueType(0), Convert); } +SDValue DAGTypeLegalizer::PromoteFloatOp_FAKE_USE(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Only Operand 1 must need promotion here"); + SDValue Op = GetPromotedFloat(N->getOperand(OpNo)); + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::Other, N->getOperand(0), + Op); +} + // Promote Operand 1 of FCOPYSIGN. Operand 0 ought to be handled by // PromoteFloatRes_FCOPYSIGN. 
SDValue DAGTypeLegalizer::PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo) { @@ -3433,6 +3443,9 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { "operand!"); case ISD::BITCAST: Res = SoftPromoteHalfOp_BITCAST(N); break; + case ISD::FAKE_USE: + Res = SoftPromoteHalfOp_FAKE_USE(N, OpNo); + break; case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break; @@ -3473,6 +3486,13 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0); } +SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FAKE_USE(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Only Operand 1 must need promotion here"); + SDValue Op = GetSoftPromotedHalf(N->getOperand(OpNo)); + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::Other, N->getOperand(0), + Op); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo) { assert(OpNo == 1 && "Only Operand 1 must need promotion here"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index c19a5a4995627a..05971152d535c6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1934,6 +1934,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; + case ISD::FAKE_USE: + Res = PromoteIntOp_FAKE_USE(N); + break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo); break; @@ -5280,6 +5283,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break; case 
ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; + case ISD::FAKE_USE: + Res = ExpandOp_FAKE_USE(N); + break; case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break; case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::EXPERIMENTAL_VP_SPLAT: @@ -6115,6 +6121,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_SUBVECTOR(SDNode *N) { return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0)); } +// FIXME: We wouldn't need this if clang could promote short integers +// that are arguments to FAKE_USE. +SDValue DAGTypeLegalizer::PromoteIntOp_FAKE_USE(SDNode *N) { + SDLoc dl(N); + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + EVT InVT1 = V1.getValueType(); + SDValue VPromoted = + DAG.getNode(ISD::ANY_EXTEND, dl, + TLI.getTypeToTransformTo(*DAG.getContext(), InVT1), V1); + return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), V0, VPromoted); +} + SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 1088db4bdbe0b3..4577346a02d605 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -391,6 +391,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_INSERT_SUBVECTOR(SDNode *N); + SDValue PromoteIntOp_FAKE_USE(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_ScalarOp(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); @@ -755,6 +756,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { bool PromoteFloatOperand(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo); + SDValue 
PromoteFloatOp_FAKE_USE(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo); @@ -800,6 +802,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_BITCAST(SDNode *N); + SDValue SoftPromoteHalfOp_FAKE_USE(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_FP_EXTEND(SDNode *N); SDValue SoftPromoteHalfOp_FP_TO_XINT(SDNode *N); @@ -877,6 +880,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecOp_VECREDUCE(SDNode *N); SDValue ScalarizeVecOp_VECREDUCE_SEQ(SDNode *N); SDValue ScalarizeVecOp_CMP(SDNode *N); + SDValue ScalarizeVecOp_FAKE_USE(SDNode *N); //===--------------------------------------------------------------------===// // Vector Splitting Support: LegalizeVectorTypes.cpp @@ -964,6 +968,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); + SDValue SplitVecOp_FAKE_USE(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo); @@ -1069,6 +1074,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_EXTEND_VECTOR_INREG(SDNode *N); + SDValue WidenVecOp_FAKE_USE(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo); SDValue WidenVecOp_VP_STRIDED_STORE(SDNode *N, unsigned OpNo); @@ -1198,6 +1204,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ExpandOp_BITCAST (SDNode *N); SDValue 
ExpandOp_BUILD_VECTOR (SDNode *N); SDValue ExpandOp_EXTRACT_ELEMENT (SDNode *N); + SDValue ExpandOp_FAKE_USE(SDNode *N); SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N); SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N); SDValue ExpandOp_NormalStore (SDNode *N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index a55364ea2c4e5b..b402e823762764 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -403,6 +403,17 @@ SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) { return N->getConstantOperandVal(1) ? Hi : Lo; } +// Split the integer operand in two and create a second FAKE_USE node for +// the other half. The original SDNode is updated in place. +SDValue DAGTypeLegalizer::ExpandOp_FAKE_USE(SDNode *N) { + SDValue Lo, Hi; + SDValue Chain = N->getOperand(0); + GetExpandedOp(N->getOperand(1), Lo, Hi); + SDValue LoUse = DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, Chain, Lo); + DAG.UpdateNodeOperands(N, LoUse, Hi); + return SDValue(N, 0); +} + SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) { // The vector type is legal but the element type needs expansion. 
EVT VecVT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 475d5806467d98..4c6da7c5df6b40 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -746,6 +746,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::BITCAST: Res = ScalarizeVecOp_BITCAST(N); break; + case ISD::FAKE_USE: + Res = ScalarizeVecOp_FAKE_USE(N); + break; case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: @@ -846,6 +849,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) { N->getValueType(0), Elt); } +// Need to legalize vector operands of fake uses. Must be <1 x ty>. +SDValue DAGTypeLegalizer::ScalarizeVecOp_FAKE_USE(SDNode *N) { + assert(N->getOperand(1).getValueType().getVectorNumElements() == 1 && + "Fake Use: Unexpected vector type!"); + SDValue Elt = GetScalarizedVector(N->getOperand(1)); + return DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, N->getOperand(0), Elt); +} + /// If the input is a vector that needs to be scalarized, it must be <1 x ty>. /// Do the operation on the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { @@ -3291,6 +3302,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { Res = SplitVecOp_CMP(N); break; + case ISD::FAKE_USE: + Res = SplitVecOp_FAKE_USE(N); + break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: @@ -3505,6 +3519,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } +// Split a FAKE_USE use of a vector into FAKE_USEs of hi and lo part. 
+SDValue DAGTypeLegalizer::SplitVecOp_FAKE_USE(SDNode *N) { + SDValue Lo, Hi; + GetSplitVector(N->getOperand(1), Lo, Hi); + SDValue Chain = + DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, N->getOperand(0), Lo); + return DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, Chain, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) { // For example, i64 = BITCAST v4i16 on alpha. Typically the vector will // end up being split all the way down to individual components. Convert the @@ -6466,6 +6489,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { report_fatal_error("Do not know how to widen this operator's operand!"); case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break; + case ISD::FAKE_USE: + Res = WidenVecOp_FAKE_USE(N); + break; case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break; case ISD::INSERT_SUBVECTOR: Res = WidenVecOp_INSERT_SUBVECTOR(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; @@ -6851,6 +6877,16 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { return CreateStackStoreLoad(InOp, VT); } +// Vectors with sizes that are not powers of 2 need to be widened to the +// next largest power of 2. For example, we may get a vector of 3 32-bit +// integers or of 6 16-bit integers, both of which have to be widened to a +// 128-bit vector. 
+SDValue DAGTypeLegalizer::WidenVecOp_FAKE_USE(SDNode *N) { + SDValue WidenedOp = GetWidenedVector(N->getOperand(1)); + return DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, N->getOperand(0), + WidenedOp); +} + SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ad24704d940a36..521a4fee8aafe0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1622,6 +1622,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef Values, SDValue N = NodeMap[V]; if (!N.getNode() && isa(V)) // Check unused arguments map. N = UnusedArgNodeMap[V]; + if (N.getNode()) { // Only emit func arg dbg value for non-variadic dbg.values for now. if (!IsVariadic && @@ -7703,6 +7704,38 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } + case Intrinsic::fake_use: { + Value *V = I.getArgOperand(0); + SDValue Ops[2]; + // For Values not declared or previously used in this basic block, the + // NodeMap will not have an entry, and `getValue` will assert if V has no + // valid register value. + auto FakeUseValue = [&]() -> SDValue { + SDValue &N = NodeMap[V]; + if (N.getNode()) + return N; + + // If there's a virtual register allocated and initialized for this + // value, use it. + if (SDValue copyFromReg = getCopyFromRegs(V, V->getType())) + return copyFromReg; + // FIXME: Do we want to preserve constants? It seems pointless. + if (isa(V)) + return getValue(V); + return SDValue(); + }(); + if (!FakeUseValue || FakeUseValue.isUndef()) + return; + Ops[0] = getRoot(); + Ops[1] = FakeUseValue; + // Also, do not translate a fake use with an undef operand, or any other + // empty SDValues. 
+ if (!Ops[1] || Ops[1].isUndef()) + return; + DAG.setRoot(DAG.getNode(ISD::FAKE_USE, sdl, MVT::Other, Ops)); + return; + } + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { // Get the exception pointer vreg, copy from it, and resize it to fit. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 001f782f209fdb..a253d1a0e20170 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -454,6 +454,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UBSANTRAP: return "ubsantrap"; case ISD::LIFETIME_START: return "lifetime.start"; case ISD::LIFETIME_END: return "lifetime.end"; + case ISD::FAKE_USE: + return "fake_use"; case ISD::PSEUDO_PROBE: return "pseudoprobe"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 09bde54b9aaa5d..8e268d4f4968ea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -804,6 +804,50 @@ static void reportFastISelFailure(MachineFunction &MF, LLVM_DEBUG(dbgs() << R.getMsg() << "\n"); } +// Detect any fake uses that follow a tail call and move them before the tail +// call. Ignore fake uses that use values that are def'd by or after the tail +// call. +static void preserveFakeUses(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + BasicBlock::iterator I = End; + if (--I == Begin || !isa(*I)) + return; + // Detect whether there are any fake uses trailing a (potential) tail call. 
+ bool HaveFakeUse = false; + bool HaveTailCall = false; + do { + if (const CallInst *CI = dyn_cast(--I)) + if (CI->isTailCall()) { + HaveTailCall = true; + break; + } + if (const IntrinsicInst *II = dyn_cast(I)) + if (II->getIntrinsicID() == Intrinsic::fake_use) + HaveFakeUse = true; + } while (I != Begin); + + // If we didn't find any tail calls followed by fake uses, we are done. + if (!HaveTailCall || !HaveFakeUse) + return; + + SmallVector FakeUses; + // Record the fake uses we found so we can move them to the front of the + // tail call. Ignore them if they use a value that is def'd by or after + // the tail call. + for (BasicBlock::iterator Inst = I; Inst != End; Inst++) { + if (IntrinsicInst *FakeUse = dyn_cast(Inst); + FakeUse && FakeUse->getIntrinsicID() == Intrinsic::fake_use) { + if (auto UsedDef = dyn_cast(FakeUse->getOperand(0)); + !UsedDef || UsedDef->getParent() != I->getParent() || + UsedDef->comesBefore(&*I)) + FakeUses.push_back(FakeUse); + } + } + + for (auto *Inst : FakeUses) + Inst->moveBefore(*Inst->getParent(), I); +} + void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, BasicBlock::const_iterator End, bool &HadTailCall) { @@ -1665,6 +1709,16 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->VisitedBBs[LLVMBB->getNumber()] = true; } + // Fake uses that follow tail calls are dropped. To avoid this, move + // such fake uses in front of the tail call, provided they don't + // use anything def'd by or after the tail call. 
+ { + BasicBlock::iterator BBStart = + const_cast(LLVMBB)->getFirstNonPHI()->getIterator(); + BasicBlock::iterator BBEnd = const_cast(LLVMBB)->end(); + preserveFakeUses(BBStart, BBEnd); + } + BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI()->getIterator(); BasicBlock::const_iterator const End = LLVMBB->end(); @@ -2448,6 +2502,13 @@ void SelectionDAGISel::Select_UNDEF(SDNode *N) { CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); } +// Use the generic target FAKE_USE target opcode. The chain operand +// must come last, because InstrEmitter::AddOperand() requires it. +void SelectionDAGISel::Select_FAKE_USE(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::FAKE_USE, N->getValueType(0), + N->getOperand(1), N->getOperand(0)); +} + void SelectionDAGISel::Select_FREEZE(SDNode *N) { // TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now. // If FREEZE instruction is added later, the code below must be changed as @@ -3219,6 +3280,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::UNDEF: Select_UNDEF(NodeToMatch); return; + case ISD::FAKE_USE: + Select_FAKE_USE(NodeToMatch); + return; case ISD::FREEZE: Select_FREEZE(NodeToMatch); return; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 1d52ebe6717f04..c0b834650d73b0 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1206,6 +1206,7 @@ void TargetPassConfig::addMachinePasses() { // addPreEmitPass. Maybe only pass "false" here for those targets? 
addPass(&FuncletLayoutID); + addPass(&RemoveLoadsIntoFakeUsesID); addPass(&StackMapLivenessID); addPass(&LiveDebugValuesID); addPass(&MachineSanitizerBinaryMetadataID); diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 6f0f3f244c050c..62d88ce21657b2 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1171,7 +1171,10 @@ Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const { const Instruction * Instruction::getPrevNonDebugInstruction(bool SkipPseudoOp) const { for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode()) - if (!isa(I) && !(SkipPseudoOp && isa(I))) + if (!isa(I) && + !(SkipPseudoOp && isa(I)) && + !(isa(I) && + cast(I)->getIntrinsicID() == Intrinsic::fake_use)) return I; return nullptr; } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 2c0f10a34f919d..79b3ca3b6a5a7e 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5128,6 +5128,7 @@ void Verifier::visitInstruction(Instruction &I) { F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void || F->getIntrinsicID() == Intrinsic::experimental_patchpoint || + F->getIntrinsicID() == Intrinsic::fake_use || F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint || F->getIntrinsicID() == Intrinsic::wasm_rethrow || IsAttachedCallOperand(F, CBI, i), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 097e29527eed9f..e86d3771bd2f26 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -315,6 +315,7 @@ void NVPTXPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); disablePass(&ShrinkWrapID); + disablePass(&RemoveLoadsIntoFakeUsesID); addPass(createNVPTXAAWrapperPass()); addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp 
b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 48a2ce89bad390..7058b15d53aa0b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -140,6 +140,7 @@ void SPIRVPassConfig::addPostRegAlloc() { disablePass(&ShrinkWrapID); disablePass(&LiveDebugValuesID); disablePass(&MachineLateInstrsCleanupID); + disablePass(&RemoveLoadsIntoFakeUsesID); // Do not work with OpPhi. disablePass(&BranchFolderPassID); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 23539a5f4b26f1..73765f8fa0092c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -552,6 +552,7 @@ void WebAssemblyPassConfig::addPostRegAlloc() { disablePass(&StackMapLivenessID); disablePass(&PatchableFunctionID); disablePass(&ShrinkWrapID); + disablePass(&RemoveLoadsIntoFakeUsesID); // This pass hurts code size for wasm because it can generate irreducible // control flow. diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 02c3ca9839fc2d..ea94a4be32b2fa 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -432,6 +432,24 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { if (MI.isCall()) FPInstClass = X86II::SpecialFP; + // A fake_use with a floating point pseudo register argument that is + // killed must behave like any other floating point operation and pop + // the floating point stack (this is done in handleSpecialFP()). + // Fake_use is, however, unusual, in that sometimes its operand is not + // killed because a later instruction (probably a return) will use it. + // It is this instruction that will pop the stack. + // In this scenario we can safely remove the fake_use's operand + // (it is live anyway). 
+ if (MI.isFakeUse()) { + const MachineOperand &MO = MI.getOperand(0); + if (MO.isReg() && X86::RFP80RegClass.contains(MO.getReg())) { + if (MO.isKill()) + FPInstClass = X86II::SpecialFP; + else + MI.removeOperand(0); + } + } + if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! @@ -1737,6 +1755,20 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { // Don't delete the inline asm! return; } + + // FAKE_USE must pop its register operand off the stack if it is killed, + // because this constitutes the register's last use. If the operand + // is not killed, it will have its last use later, so we leave it alone. + // In either case we remove the operand so later passes don't see it. + case TargetOpcode::FAKE_USE: { + assert(MI.getNumExplicitOperands() == 1 && + "FAKE_USE must have exactly one operand"); + if (MI.getOperand(0).isKill()) { + freeStackSlotBefore(Inst, getFPReg(MI.getOperand(0))); + } + MI.removeOperand(0); + return; + } } Inst = MBB->erase(Inst); // Remove the pseudo instruction diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 26b62cb79cdedf..2310cb3a7decbb 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3802,6 +3802,12 @@ class AggLoadStoreRewriter : public InstVisitor { struct LoadOpSplitter : public OpSplitter { AAMDNodes AATags; + // A vector to hold the split components that we want to emit + // separate fake uses for. + SmallVector Components; + // A vector to hold all the fake uses of the struct that we are splitting. + // Usually there should only be one, but we are handling the general case. 
+ SmallVector FakeUses; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, @@ -3826,10 +3832,32 @@ class AggLoadStoreRewriter : public InstVisitor { GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) Load->setAAMetadata( AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL)); + // Record the load so we can generate a fake use for this aggregate + // component. + Components.push_back(Load); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); } + + // Stash the fake uses that use the value generated by this instruction. + void recordFakeUses(LoadInst &LI) { + for (Use &U : LI.uses()) + if (auto *II = dyn_cast(U.getUser())) + if (II->getIntrinsicID() == Intrinsic::fake_use) + FakeUses.push_back(II); + } + + // Replace all fake uses of the aggregate with a series of fake uses, one + // for each split component. + void emitFakeUses() { + for (Instruction *I : FakeUses) { + IRB.SetInsertPoint(I); + for (auto *V : Components) + IRB.CreateIntrinsic(Intrinsic::fake_use, {}, {V}); + I->eraseFromParent(); + } + } }; bool visitLoadInst(LoadInst &LI) { @@ -3841,8 +3869,10 @@ class AggLoadStoreRewriter : public InstVisitor { LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), getAdjustedAlignment(&LI, 0), DL, IRB); + Splitter.recordFakeUses(LI); Value *V = PoisonValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); + Splitter.emitFakeUses(); Visited.erase(&LI); LI.replaceAllUsesWith(V); LI.eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 47e3c03288d979..dc9ca1423f3e79 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -513,6 +513,12 @@ void PruningFunctionCloner::CloneBlock( 
for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { + // Don't clone fake_use as it may suppress many optimizations + // due to inlining, especially SROA. + if (auto *IntrInst = dyn_cast(II)) + if (IntrInst->getIntrinsicID() == Intrinsic::fake_use) + continue; + Instruction *NewInst = cloneInstruction(II); NewInst->insertInto(NewBB, NewBB->end()); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index e4809cd4bb44db..d0669e44f821b3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3491,6 +3491,9 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To, unsigned Count = 0; for (Use &U : llvm::make_early_inc_range(From->uses())) { + auto *II = dyn_cast(U.getUser()); + if (II && II->getIntrinsicID() == Intrinsic::fake_use) + continue; if (!ShouldReplace(Root, U)) continue; LLVM_DEBUG(dbgs() << "Replace dominated use of '"; diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 5251eb86bca926..1b7912fdf5e304 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -80,7 +80,8 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { if (SI->isVolatile()) return false; } else if (const IntrinsicInst *II = dyn_cast(U)) { - if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable() && + II->getIntrinsicID() != Intrinsic::fake_use) return false; } else if (const BitCastInst *BCI = dyn_cast(U)) { if (!onlyUsedByLifetimeMarkersOrDroppableInsts(BCI)) diff --git a/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll b/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll index 2370fe1468b4ed..5d304d1569d3f6 100644 --- a/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll +++ b/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll 
@@ -16,7 +16,7 @@ for.body.lr.ph: ; preds = %entry for.body: ; preds = %for.inc, %for.body.lr.ph %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] %add = add nsw i32 %i.02, 50, !dbg !16 - call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !18, metadata !19), !dbg !20 + tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !18, metadata !19), !dbg !20 %idxprom = sext i32 %add to i64, !dbg !21 ; CHECK: %idxprom = sext i32 %add to i64 diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index ba611493e1a76e..ec6f24a5650fa3 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -69,6 +69,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: Workaround A53 erratum 835769 pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 3465b717261cf5..ffbe3dd377109f 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -224,6 +224,7 @@ ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Workaround A53 erratum 835769 pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 29e8ebdafb5871..7bf1b8746fd87b 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -143,6 +143,7 @@ ; GCN-O0-NEXT: Post RA hazard recognizer ; GCN-O0-NEXT: Branch relaxation pass ; GCN-O0-NEXT: Register Usage Information Collector Pass 
+; GCN-O0-NEXT: Remove Loads Into Fake Uses ; GCN-O0-NEXT: Live DEBUG_VALUE analysis ; GCN-O0-NEXT: Machine Sanitizer Binary Metadata ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis @@ -420,6 +421,7 @@ ; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass ; GCN-O1-NEXT: Register Usage Information Collector Pass +; GCN-O1-NEXT: Remove Loads Into Fake Uses ; GCN-O1-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-NEXT: Machine Sanitizer Binary Metadata ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis @@ -725,6 +727,7 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass +; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-OPTS-NEXT: Machine Sanitizer Binary Metadata ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis @@ -1036,6 +1039,7 @@ ; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass ; GCN-O2-NEXT: Register Usage Information Collector Pass +; GCN-O2-NEXT: Remove Loads Into Fake Uses ; GCN-O2-NEXT: Live DEBUG_VALUE analysis ; GCN-O2-NEXT: Machine Sanitizer Binary Metadata ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis @@ -1359,6 +1363,7 @@ ; GCN-O3-NEXT: AMDGPU Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass ; GCN-O3-NEXT: Register Usage Information Collector Pass +; GCN-O3-NEXT: Remove Loads Into Fake Uses ; GCN-O3-NEXT: Live DEBUG_VALUE analysis ; GCN-O3-NEXT: Machine Sanitizer Binary Metadata ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 9b983d96f79330..819623d3fcc5a3 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -193,6 +193,7 @@ ; CHECK-NEXT: ARM block placement ; CHECK-NEXT: optimise barriers pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; 
CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll index 38c1dbcb1075fa..24bd4c75a9821e 100644 --- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index 50f154663177d0..53cdbd18f9b907 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -166,6 +166,7 @@ ; LAXX-NEXT: Implement the 'patchable-function' attribute ; LAXX-NEXT: Branch relaxation pass ; LAXX-NEXT: Contiguously Lay Out Funclets +; LAXX-NEXT: Remove Loads Into Fake Uses ; LAXX-NEXT: StackMap Liveness Analysis ; LAXX-NEXT: Live DEBUG_VALUE analysis ; LAXX-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir new file mode 100644 index 00000000000000..7eb8915f26a80f --- /dev/null +++ b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir @@ -0,0 +1,99 @@ +# In certain cases CodeGenPrepare folds a return instruction into +# the return block's predecessor blocks and subsequently deletes the return block. +# The purpose of this is to enable tail call optimization in the predecessor blocks. +# Removal of the return block also removes fake use instructions that were present +# in the return block, potentially causing debug information to be lost. 
+# +# The fix is to clone any fake use instructions that are not dominated by definitions +# in the return block itself into the predecessor blocks. This test ensures that we do so. +# +# Generated from the following source with +# clang -fextend-lifetimes -S -emit-llvm -O2 -mllvm -stop-before=codegenprepare -o test.mir test.c +# +# extern int f0(); +# extern int f1(); +# +# int foo(int i) { +# int temp = i; +# if (temp == 0) +# temp = f0(); +# else +# temp = f1(); +# return temp; +# } +# +# RUN: llc -run-pass=codegenprepare -o - %s | FileCheck %s +# +# CHECK: define{{.*}}foo +# CHECK: if.then: +# CHECK-NEXT: call{{.*}}fake.use(i32 %i) +# CHECK-NEXT: tail call i32{{.*}}@f0 +# CHECK-NEXT: ret +# CHECK: if.else: +# CHECK-NEXT: call{{.*}}fake.use(i32 %i) +# CHECK-NEXT: tail call i32{{.*}}@f1 +# CHECK-NEXT: ret + +--- | + define hidden i32 @foo(i32 %i) local_unnamed_addr optdebug { + entry: + %cmp = icmp eq i32 %i, 0 + br i1 %cmp, label %if.then, label %if.else + + if.then: + %call = tail call i32 (...) @f0() + br label %if.end + + if.else: + %call1 = tail call i32 (...) @f1() + br label %if.end + + if.end: + %temp.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ] + notail call void (...) @llvm.fake.use(i32 %temp.0) + notail call void (...) @llvm.fake.use(i32 %i) + ret i32 %temp.0 + } + declare i32 @f0(...) local_unnamed_addr + declare i32 @f1(...) local_unnamed_addr + +...
+--- +name: foo +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + +... diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll index 4a17384e499936..5853647bf3b9f3 100644 --- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll @@ -60,6 +60,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: PowerPC Pre-Emit Peephole ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index 39b23a57513d9d..21bd4bb8502c3d 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -214,6 +214,7 @@ ; CHECK-NEXT: PowerPC Pre-Emit Peephole ; CHECK-NEXT: PowerPC Early-Return Creation ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index 7473809a2c5d2e..84c7f3f987c065 
100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -63,6 +63,7 @@ ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: RISC-V Make Compressible ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 44c270fdc3c257..5d14d14d216244 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -189,6 +189,7 @@ ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: RISC-V Make Compressible ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 98b86384b8443d..4c99dd830b442e 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -71,6 +71,7 @@ ; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/X86/fake-use-hpfloat.ll b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll new file mode 100644 index 00000000000000..7a95c38801837c --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll @@ -0,0 +1,15 @@ +; assert in DAGlegalizer with fake use of half precision float. +; Changes to half float promotion. 
+; RUN: llc -stop-after=finalize-isel -o - %s | FileCheck %s +; +; CHECK: bb.0.entry: +; CHECK-NEXT: %0:fr16 = FsFLD0SH +; CHECK-NEXT: FAKE_USE killed %0 +; +target triple = "x86_64-unknown-unknown" + +define void @_Z6doTestv() local_unnamed_addr optdebug { +entry: + tail call void (...) @llvm.fake.use(half 0xH0000) + ret void +} diff --git a/llvm/test/CodeGen/X86/fake-use-ld.ll b/llvm/test/CodeGen/X86/fake-use-ld.ll new file mode 100644 index 00000000000000..86e7235091dd1c --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-ld.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +; Checks that fake uses of the FP stack do not cause a crash. +; +; /*******************************************************************/ +; extern long double foo(long double, long double, long double); +; +; long double actual(long double p1, long double p2, long double p3) { +; return fmal(p1, p2, p3); +; } +; /*******************************************************************/ + +define x86_fp80 @actual(x86_fp80 %p1, x86_fp80 %p2, x86_fp80 %p3) optdebug { +; +; CHECK: actual +; +entry: + %p1.addr = alloca x86_fp80, align 16 + %p2.addr = alloca x86_fp80, align 16 + %p3.addr = alloca x86_fp80, align 16 + store x86_fp80 %p1, ptr %p1.addr, align 16 + store x86_fp80 %p2, ptr %p2.addr, align 16 + store x86_fp80 %p3, ptr %p3.addr, align 16 + %0 = load x86_fp80, ptr %p1.addr, align 16 + %1 = load x86_fp80, ptr %p2.addr, align 16 + %2 = load x86_fp80, ptr %p3.addr, align 16 +; +; CHECK: callq{{.*}}foo +; + %3 = call x86_fp80 @foo(x86_fp80 %0, x86_fp80 %1, x86_fp80 %2) + %4 = load x86_fp80, ptr %p1.addr, align 16 + call void (...) @llvm.fake.use(x86_fp80 %4) + %5 = load x86_fp80, ptr %p2.addr, align 16 + call void (...) @llvm.fake.use(x86_fp80 %5) + %6 = load x86_fp80, ptr %p3.addr, align 16 + call void (...) 
@llvm.fake.use(x86_fp80 %6) +; +; CHECK: ret +; + ret x86_fp80 %3 +} + +declare x86_fp80 @foo(x86_fp80, x86_fp80, x86_fp80) diff --git a/llvm/test/CodeGen/X86/fake-use-scheduler.mir b/llvm/test/CodeGen/X86/fake-use-scheduler.mir new file mode 100644 index 00000000000000..7e55f1d79aa7b6 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-scheduler.mir @@ -0,0 +1,123 @@ +# Prevent the machine scheduler from moving instructions past FAKE_USE. +# RUN: llc -run-pass machine-scheduler -debug-only=machine-scheduler 2>&1 -o - %s | FileCheck %s +# REQUIRES: asserts +# +# We make sure that, beginning with the first FAKE_USE instruction, +# no changes to the sequence of instructions are undertaken by the +# scheduler. We don't bother to check that the order of the FAKE_USEs +# remains the same. They should, but it is irrelevant. +# +# CHECK: ********** MI Scheduling ********** +# CHECK-NEXT: foo:%bb.0 entry +# CHECK-NEXT: From: %0:gr64 = COPY $rdi +# CHECK-NEXT: To: FAKE_USE %5:gr64 +# CHECK-NEXT: RegionInstrs: 7 +# +# CHECK: ********** MI Scheduling ********** +# CHECK-NEXT: bar:%bb.0 entry +# CHECK-NEXT: From: %0:gr64 = COPY $rdi +# CHECK-NEXT: To: RET 0, killed $rax +# CHECK-NEXT: RegionInstrs: 7 +# +--- | + ; ModuleID = 'test.ll' + source_filename = "test.ll" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + + @glb = common dso_local local_unnamed_addr global [100 x i32] zeroinitializer, align 16 + + define dso_local i64 @foo(ptr %p) local_unnamed_addr optdebug { + entry: + %0 = load i32, ptr @glb, align 16 + store i32 %0, ptr %p, align 4 + %conv = sext i32 %0 to i64 + %1 = load i32, ptr getelementptr inbounds ([100 x i32], ptr @glb, i64 0, i64 1), align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %p, i64 1 + store i32 %1, ptr %arrayidx1, align 4 + %conv2 = sext i32 %1 to i64 + %add3 = add nsw i64 %conv2, %conv + notail call void (...) @llvm.fake.use(i64 %add3) + notail call void (...) 
@llvm.fake.use(i32 %1) + notail call void (...) @llvm.fake.use(i32 %0) + notail call void (...) @llvm.fake.use(ptr %p) + ret i64 %add3 + } + + define dso_local i64 @bar(ptr %p) local_unnamed_addr optdebug { + entry: + %0 = load i32, ptr @glb, align 16 + store i32 %0, ptr %p, align 4 + %conv = sext i32 %0 to i64 + %1 = load i32, ptr getelementptr inbounds ([100 x i32], ptr @glb, i64 0, i64 1), align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %p, i64 1 + store i32 %1, ptr %arrayidx1, align 4 + %conv2 = sext i32 %1 to i64 + %add3 = add nsw i64 %conv2, %conv + ret i64 %add3 + } + + ; Function Attrs: nocallback nofree nosync nounwind willreturn + declare void @llvm.stackprotector(ptr, ptr) + +... +--- +name: foo +alignment: 16 +tracksRegLiveness: true +debugInstrRef: true +registers: + - { id: 0, class: gr64, preferred-register: '' } + - { id: 1, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 4, class: gr32, preferred-register: '' } + - { id: 5, class: gr64, preferred-register: '' } +liveins: + - { reg: '$rdi', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $rdi + + %0:gr64 = COPY $rdi + %1:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb, $noreg + MOV32mr %0, 1, $noreg, 0, $noreg, %1.sub_32bit + %3:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb + 4, $noreg + MOV32mr %0, 1, $noreg, 4, $noreg, %3.sub_32bit + %5:gr64 = COPY %3 + %5:gr64 = nsw ADD64rr %5, %1, implicit-def dead $eflags + FAKE_USE %5 + FAKE_USE %3.sub_32bit + FAKE_USE %1.sub_32bit + FAKE_USE %0 + $rax = COPY %5 + RET 0, killed $rax + +... 
+--- +name: bar +alignment: 16 +tracksRegLiveness: true +debugInstrRef: true +registers: + - { id: 0, class: gr64, preferred-register: '' } + - { id: 1, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 4, class: gr32, preferred-register: '' } + - { id: 5, class: gr64_with_sub_8bit, preferred-register: '' } +liveins: + - { reg: '$rdi', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $rdi + + %0:gr64 = COPY $rdi + %1:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb, $noreg + MOV32mr %0, 1, $noreg, 0, $noreg, %1.sub_32bit + %5:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb + 4, $noreg + MOV32mr %0, 1, $noreg, 4, $noreg, %5.sub_32bit + %5:gr64_with_sub_8bit = nsw ADD64rr %5, %1, implicit-def dead $eflags + $rax = COPY %5 + RET 0, killed $rax + +... diff --git a/llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll b/llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll new file mode 100644 index 00000000000000..45a210ef391009 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -o - \ +; RUN: | FileCheck %s --implicit-check-not=TAILCALL +; Generated with: clang -emit-llvm -O2 -S -fextend-lifetimes test.cpp -o - +; =========== test.cpp =============== +; extern int bar(int); +; int foo1(int i) +; { +; return bar(i); +; } +; =========== test.cpp =============== + +; CHECK: TAILCALL + +; ModuleID = 'test.cpp' +source_filename = "test.cpp" + +define i32 @_Z4foo1i(i32 %i) local_unnamed_addr optdebug { +entry: + %call = tail call i32 @_Z3bari(i32 %i) + tail call void (...) 
@llvm.fake.use(i32 %i) + ret i32 %call +} + +declare i32 @_Z3bari(i32) local_unnamed_addr diff --git a/llvm/test/CodeGen/X86/fake-use-suppress-load.ll b/llvm/test/CodeGen/X86/fake-use-suppress-load.ll new file mode 100644 index 00000000000000..c1b442ebd79ffa --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-suppress-load.ll @@ -0,0 +1,14 @@ +; Suppress redundant loads feeding into fake uses. +; RUN: llc -filetype=asm -o - %s --mtriple=x86_64-unknown-unknown | FileCheck %s +; Windows ABI works differently, there's no offset. +; +; Look for the spill +; CHECK: movq %r{{[a-z]+,}} -{{[0-9]+\(%rsp\)}} +; CHECK-NOT: movq -{{[0-9]+\(%rsp\)}}, %r{{[a-z]+}} + +define dso_local i32 @f(ptr %p) local_unnamed_addr optdebug { +entry: + call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #1 + notail call void (...) @llvm.fake.use(ptr %p) + ret i32 4 +} diff --git a/llvm/test/CodeGen/X86/fake-use-tailcall.ll b/llvm/test/CodeGen/X86/fake-use-tailcall.ll new file mode 100644 index 00000000000000..10bb22e1b564ab --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-tailcall.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -stop-after=finalize-isel - | FileCheck %s --implicit-check-not FAKE_USE +; Fake uses following tail calls should be pulled in front +; of the TCRETURN instruction. Fake uses using something defined by +; the tail call or after it should be suppressed. + +; CHECK: name:{{ +}}bar +; CHECK: body: +; CHECK: bb.0.{{.*}}: +; CHECK: %0:{{.*}}= COPY +; CHECK: FAKE_USE %0 +; CHECK: TCRETURN + +; CHECK: name:{{ +}}baz +; CHECK: body: +; CHECK: bb.0.{{.*}}: +; CHECK: %0:{{.*}}= COPY +; CHECK: FAKE_USE %0 +; CHECK: TCRETURN + +define void @bar(i32 %v) optdebug { +entry: + %call = tail call i32 @_Z3fooi(i32 %v) + %mul = mul nsw i32 %call, 3 + notail call void (...) @llvm.fake.use(i32 %mul) + notail call void (...) @llvm.fake.use(i32 %call) + notail call void (...) 
@llvm.fake.use(i32 %v) + ret void +} + +define i32 @baz(i32 %v) optdebug { +entry: + %call = tail call i32 @_Z3fooi(i32 %v) + notail call void (...) @llvm.fake.use(i32 %v) + ret i32 %call +} + +declare i32 @_Z3fooi(i32) local_unnamed_addr diff --git a/llvm/test/CodeGen/X86/fake-use-vector.ll b/llvm/test/CodeGen/X86/fake-use-vector.ll new file mode 100644 index 00000000000000..cb46ccc8cac11c --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-vector.ll @@ -0,0 +1,39 @@ +; assert in DAGlegalizer with fake use of 1-element vectors. +; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; +; ModuleID = 't2.cpp' +; source_filename = "t2.cpp" +; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +; +; Check that we get past ISel and generate FAKE_USE machine instructions for +; one-element vectors. +; +; CHECK: bb.0.entry: +; CHECK-DAG: %1:gr64 = COPY $rdi +; CHECK-DAG: %0:vr128 = COPY $xmm0 +; CHECK: %2:vr64 = +; CHECK-DAG: FAKE_USE %1 +; CHECK-DAG: FAKE_USE %0 +; CHECK: RET + + +target triple = "x86_64-unknown-unknown" + +; Function Attrs: nounwind sspstrong uwtable +define <4 x float> @_Z3runDv4_fDv1_x(<4 x float> %r, i64 %b.coerce) local_unnamed_addr #0 { +entry: + %0 = insertelement <1 x i64> undef, i64 %b.coerce, i32 0 + %1 = bitcast i64 %b.coerce to <1 x i64> + %2 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %r, <1 x i64> %1) + tail call void (...) @llvm.fake.use(<1 x i64> %0) + tail call void (...) @llvm.fake.use(<4 x float> %r) + ret <4 x float> %2 +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) + +; Function Attrs: nounwind +declare void @llvm.fake.use(...) 
+ +attributes #0 = { "target-cpu"="btver2" optdebug } diff --git a/llvm/test/CodeGen/X86/fake-use-vector2.ll b/llvm/test/CodeGen/X86/fake-use-vector2.ll new file mode 100644 index 00000000000000..6f2d3a5566dc67 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-vector2.ll @@ -0,0 +1,27 @@ +; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; +; Make sure we can split vectors that are used as operands of FAKE_USE. + +; Generated from: +; +; typedef long __attribute__((ext_vector_type(8))) long8; +; void test0() { long8 id208 {0, 1, 2, 3, 4, 5, 6, 7}; } + +; ModuleID = 't5.cpp' +source_filename = "t5.cpp" + + +; CHECK: %0:vr256 = VMOV +; CHECK: %1:vr256 = VMOV +; CHECK-DAG: FAKE_USE killed %1 +; CHECK-DAG: FAKE_USE killed %0 +; CHECK: RET +define void @_Z5test0v() local_unnamed_addr #0 { +entry: + tail call void (...) @llvm.fake.use(<8 x i64> ) #1 + ret void +} + +declare void @llvm.fake.use(...) + +attributes #0 = { "target-cpu"="btver2" optdebug } diff --git a/llvm/test/CodeGen/X86/fake-use-zero-length.ll b/llvm/test/CodeGen/X86/fake-use-zero-length.ll new file mode 100644 index 00000000000000..e8c6791b8edff2 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-zero-length.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -stop-after=finalize-isel | FileCheck %s --implicit-check-not=FAKE_USE +; +; Make sure SelectionDAG does not crash handling fake uses of zero-length arrays +; and structs. Check also that they are not propagated. 
+; +; Generated from the following source with +; clang -fextend-lifetimes -S -emit-llvm -O2 -mllvm -stop-after=safe-stack -o test.mir test.cpp +; +; int main () +; { int array[0]; } +; +; +; CHECK: liveins: $[[IN_REG:[a-zA-Z0-9]+]] +; CHECK: %[[IN_VREG:[a-zA-Z0-9]+]]:gr32 = COPY $[[IN_REG]] +; CHECK: FAKE_USE %[[IN_VREG]] + +source_filename = "test.ll" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define hidden i32 @main([0 x i32] %zero, [1 x i32] %one) local_unnamed_addr optdebug { +entry: + notail call void (...) @bar([0 x i32] %zero) + notail call void (...) @baz([1 x i32] %one) + notail call void (...) @llvm.fake.use([0 x i32] %zero) + notail call void (...) @llvm.fake.use([1 x i32] %one) + ret i32 0 +} + +declare void @bar([0 x i32] %a) +declare void @baz([1 x i32] %a) diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 12c16a03b134c8..545640b7661691 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -211,6 +211,7 @@ ; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll b/llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll new file mode 100644 index 00000000000000..65a64583096959 --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll @@ -0,0 +1,98 @@ +; REQUIRES: object-emission + +; Make sure the fake use of 'b' at the end of 'foo' causes location information for 'b' +; to extend all the way to the end of the function. +; Duplicates `DebugInfo/X86/fake-use.ll` for global-isel. 
+ +; RUN: %llc_dwarf -O2 --global-isel=1 -mtriple=aarch64--linux-gnu -filetype=obj -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: %python %p/../Inputs/check-fake-use.py %t +; RUN: sed -e 's,call void (...) @llvm.fake.use,;,' %s \ +; RUN: | %llc_dwarf - -O2 --global-isel=1 -mtriple=aarch64--linux-gnu -filetype=obj -dwarf-linkage-names=Abstract \ +; RUN: | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: not %python %p/../Inputs/check-fake-use.py %t + +; Generated with: +; clang -O2 -g -S -emit-llvm -fextend-this-ptr fake-use.c +; +; int glob[10]; +; extern void bar(); +; +; int foo(int b, int i) +; { +; int loc = glob[i] * 2; +; if (b) { +; glob[2] = loc; +; bar(); +; } +; return loc; +; } +; +; ModuleID = 't2.c' +source_filename = "t2.c" + +@glob = common local_unnamed_addr global [10 x i32] zeroinitializer, align 16, !dbg !0 + +; Function Attrs: nounwind sspstrong uwtable +define i32 @foo(i32 %b, i32 %i) local_unnamed_addr optdebug !dbg !13 { +entry: + #dbg_value(i32 %b, !17, !20, !21) + %c = add i32 %b, 42 + %tobool = icmp sgt i32 %c, 2, !dbg !27 + tail call void (...) @bar() #2, !dbg !32 + %idxprom = sext i32 %i to i64, !dbg !22 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @glob, i64 0, i64 %idxprom, !dbg !22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !23 + %mul = shl nsw i32 %0, 1, !dbg !22 + br i1 %tobool, label %if.end, label %if.then, !dbg !29 + +if.then: ; preds = %entry + store i32 %mul, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @glob, i64 0, i64 2), align 8, !dbg !30, !tbaa !23 + tail call void (...) @bar() #2, !dbg !32 + br label %if.end, !dbg !33 + +if.end: ; preds = %entry, %if.then + call void (...) @llvm.fake.use(i32 %b), !dbg !34 + ret i32 %mul, !dbg !35 +} + +declare void @bar(...) 
local_unnamed_addr + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!9, !10, !11} +!llvm.ident = !{!12} + +!0 = distinct !DIGlobalVariableExpression(var: !DIGlobalVariable(name: "glob", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true), expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 4.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, globals: !4) +!2 = !DIFile(filename: "t2.c", directory: "/") +!3 = !{} +!4 = !{!0} +!5 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 320, align: 32, elements: !7) +!6 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!7 = !{!8} +!8 = !DISubrange(count: 10) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{i32 1, !"PIC Level", i32 2} +!12 = !{!"clang version 4.0.0"} +!13 = distinct !DISubprogram(name: "foo", scope: !2, file: !2, line: 4, type: !14, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !1, retainedNodes: !16) +!14 = !DISubroutineType(types: !15) +!15 = !{!6, !6, !6} +!16 = !{!17, !19} +!17 = !DILocalVariable(name: "b", arg: 1, scope: !13, file: !2, line: 4, type: !6) +!19 = !DILocalVariable(name: "loc", scope: !13, file: !2, line: 6, type: !6) +!20 = !DIExpression() +!21 = !DILocation(line: 4, scope: !13) +!22 = !DILocation(line: 6, scope: !13) +!23 = !{!24, !24, i64 0} +!24 = !{!"int", !25, i64 0} +!25 = !{!"omnipotent char", !26, i64 0} +!26 = !{!"Simple C/C++ TBAA"} +!27 = !DILocation(line: 7, scope: !28) +!28 = distinct !DILexicalBlock(scope: !13, file: !2, line: 7) +!29 = !DILocation(line: 7, scope: !13) +!30 = !DILocation(line: 8, scope: !31) +!31 = distinct !DILexicalBlock(scope: !28, file: !2, line: 7) +!32 = !DILocation(line: 9, scope: !31) +!33 = !DILocation(line: 10, scope: !31) +!34 = !DILocation(line: 12, scope: !13) +!35 = !DILocation(line: 11, scope: !13) 
diff --git a/llvm/test/DebugInfo/Inputs/check-fake-use.py b/llvm/test/DebugInfo/Inputs/check-fake-use.py new file mode 100644 index 00000000000000..7797e102419b24 --- /dev/null +++ b/llvm/test/DebugInfo/Inputs/check-fake-use.py @@ -0,0 +1,107 @@ +#!/usr/bin/python3 + +# Parsing dwarfdump's output to determine whether the location list for the +# parameter "b" covers all of the function. The script searches for information +# in the input file to determine the [prologue, epilogue) range for the +# function, the location list range for "b", and checks that the latter covers +# the entirety of the former. +import re +import sys + +DebugInfoPattern = r"\.debug_info contents:" +DebugLinePattern = r"\.debug_line contents:" +ProloguePattern = r"^\s*0x([0-9a-f]+)\s.+prologue_end" +EpiloguePattern = r"^\s*0x([0-9a-f]+)\s.+epilogue_begin" +FormalPattern = r"^0x[0-9a-f]+:\s+DW_TAG_formal_parameter" +LocationPattern = r"DW_AT_location\s+\[DW_FORM_([a-z_]+)\](?:.*0x([a-f0-9]+))" +DebugLocPattern = r'\[0x([a-f0-9]+),\s+0x([a-f0-9]+)\) ".text": (.+)$' + +SeenDebugInfo = False +SeenDebugLine = False +LocationRanges = None +PrologueEnd = None +EpilogueBegin = None + +# The dwarfdump output should contain the DW_AT_location for "b" first, then the +# line table which should contain prologue_end and epilogue_begin entries. +with open(sys.argv[1], "r") as dwarf_dump_file: + dwarf_iter = iter(dwarf_dump_file) + for line in dwarf_iter: + if not SeenDebugInfo and re.match(DebugInfoPattern, line): + SeenDebugInfo = True + if not SeenDebugLine and re.match(DebugLinePattern, line): + SeenDebugLine = True + # Get the range of DW_AT_location for "b". + if LocationRanges is None: + if match := re.match(FormalPattern, line): + # Go until we either find DW_AT_location or reach the end of this entry. 
+ location_match = None + while location_match is None: + if (line := next(dwarf_iter, "")) == "\n": + raise RuntimeError( + ".debug_info output is missing DW_AT_location for 'b'" + ) + location_match = re.search(LocationPattern, line) + # Variable has whole-scope location, represented by an empty tuple. + if location_match.group(1) == "exprloc": + LocationRanges = () + continue + if location_match.group(1) != "sec_offset": + raise RuntimeError( + f"Unhandled form for DW_AT_location: DW_FORM_{location_match.group(1)}" + ) + # Variable has location range list. + if ( + debug_loc_match := re.search(DebugLocPattern, next(dwarf_iter, "")) + ) is None: + raise RuntimeError(f"Invalid location range list for 'b'") + LocationRanges = ( + int(debug_loc_match.group(1), 16), + int(debug_loc_match.group(2), 16), + ) + while ( + debug_loc_match := re.search(DebugLocPattern, next(dwarf_iter, "")) + ) is not None: + match_loc_start = int(debug_loc_match.group(1), 16) + match_loc_end = int(debug_loc_match.group(2), 16) + match_expr = debug_loc_match.group(3) + if match_loc_start != LocationRanges[1]: + raise RuntimeError( + f"Location list for 'b' is discontinuous from [0x{LocationRanges[1]:x}, 0x{match_loc_start:x})" + ) + if "stack_value" in match_expr: + raise RuntimeError( + f"Location list for 'b' contains a stack_value expression: {match_expr}" + ) + LocationRanges = (LocationRanges[0], match_loc_end) + # Get the prologue_end address. + elif PrologueEnd is None: + if match := re.match(ProloguePattern, line): + PrologueEnd = int(match.group(1), 16) + # Get the epilogue_begin address. 
+ elif EpilogueBegin is None: + if match := re.match(EpiloguePattern, line): + EpilogueBegin = int(match.group(1), 16) + break + +if not SeenDebugInfo: + raise RuntimeError(".debug_info section not found.") +if not SeenDebugLine: + raise RuntimeError(".debug_line section not found.") + +if LocationRanges is None: + raise RuntimeError(".debug_info output is missing parameter 'b'") +if PrologueEnd is None: + raise RuntimeError(".debug_line output is missing prologue_end") +if EpilogueBegin is None: + raise RuntimeError(".debug_line output is missing epilogue_begin") + +if len(LocationRanges) == 2 and ( + LocationRanges[0] > PrologueEnd or LocationRanges[1] < EpilogueBegin +): + raise RuntimeError( + f"""Location list for 'b' does not cover the whole function:") + Prologue to Epilogue = [0x{PrologueEnd:x}, 0x{EpilogueBegin:x}) + Location range = [0x{LocationRanges[0]:x}, 0x{LocationRanges[1]:x}) +""" + ) diff --git a/llvm/test/DebugInfo/X86/fake-use.ll b/llvm/test/DebugInfo/X86/fake-use.ll new file mode 100644 index 00000000000000..f44aadfeef5640 --- /dev/null +++ b/llvm/test/DebugInfo/X86/fake-use.ll @@ -0,0 +1,96 @@ +; REQUIRES: object-emission + +; Make sure the fake use of 'b' at the end of 'foo' causes location information for 'b' +; to extend all the way to the end of the function. + +; RUN: %llc_dwarf -O2 -filetype=obj -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: %python %p/../Inputs/check-fake-use.py %t +; RUN: sed -e 's,call void (...) 
@llvm.fake.use,;,' %s | %llc_dwarf - -O2 -filetype=obj -dwarf-linkage-names=Abstract | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: not %python %p/../Inputs/check-fake-use.py %t + +; Generated with: +; clang -O2 -g -S -emit-llvm -fextend-this-ptr fake-use.c +; +; int glob[10]; +; extern void bar(); +; +; int foo(int b, int i) +; { +; int loc = glob[i] * 2; +; if (b) { +; glob[2] = loc; +; bar(); +; } +; return loc; +; } +; +; ModuleID = 't2.c' +source_filename = "t2.c" + +@glob = common local_unnamed_addr global [10 x i32] zeroinitializer, align 16, !dbg !0 + +; Function Attrs: nounwind sspstrong uwtable +define i32 @foo(i32 %b, i32 %i) local_unnamed_addr optdebug !dbg !13 { +entry: + #dbg_value(i32 %b, !17, !20, !21) + %c = add i32 %b, 42 + %tobool = icmp sgt i32 %c, 2, !dbg !27 + tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() + tail call void (...) @bar() #2, !dbg !32 + %idxprom = sext i32 %i to i64, !dbg !22 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @glob, i64 0, i64 %idxprom, !dbg !22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !23 + %mul = shl nsw i32 %0, 1, !dbg !22 + br i1 %tobool, label %if.end, label %if.then, !dbg !29 + +if.then: ; preds = %entry + store i32 %mul, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @glob, i64 0, i64 2), align 8, !dbg !30, !tbaa !23 + tail call void (...) @bar() #2, !dbg !32 + br label %if.end, !dbg !33 + +if.end: ; preds = %entry, %if.then + call void (...) @llvm.fake.use(i32 %b), !dbg !34 + ret i32 %mul, !dbg !35 +} + +declare void @bar(...) 
local_unnamed_addr + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!9, !10, !11} +!llvm.ident = !{!12} + +!0 = distinct !DIGlobalVariableExpression(var: !DIGlobalVariable(name: "glob", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true), expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 4.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, globals: !4) +!2 = !DIFile(filename: "t2.c", directory: "/") +!3 = !{} +!4 = !{!0} +!5 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 320, align: 32, elements: !7) +!6 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!7 = !{!8} +!8 = !DISubrange(count: 10) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{i32 1, !"PIC Level", i32 2} +!12 = !{!"clang version 4.0.0"} +!13 = distinct !DISubprogram(name: "foo", scope: !2, file: !2, line: 4, type: !14, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !1, retainedNodes: !16) +!14 = !DISubroutineType(types: !15) +!15 = !{!6, !6, !6} +!16 = !{!17, !19} +!17 = !DILocalVariable(name: "b", arg: 1, scope: !13, file: !2, line: 4, type: !6) +!19 = !DILocalVariable(name: "loc", scope: !13, file: !2, line: 6, type: !6) +!20 = !DIExpression() +!21 = !DILocation(line: 4, scope: !13) +!22 = !DILocation(line: 6, scope: !13) +!23 = !{!24, !24, i64 0} +!24 = !{!"int", !25, i64 0} +!25 = !{!"omnipotent char", !26, i64 0} +!26 = !{!"Simple C/C++ TBAA"} +!27 = !DILocation(line: 7, scope: !28) +!28 = distinct !DILexicalBlock(scope: !13, file: !2, line: 7) +!29 = !DILocation(line: 7, scope: !13) +!30 = !DILocation(line: 8, scope: !31) +!31 = distinct !DILexicalBlock(scope: !28, file: !2, line: 7) +!32 = !DILocation(line: 9, scope: !31) +!33 = !DILocation(line: 10, scope: !31) +!34 = !DILocation(line: 12, scope: !13) +!35 = !DILocation(line: 11, scope: !13) 
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index b52849b6bc931d..00601b7ae6e0d2 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -136,14 +136,14 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { // CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2([[#LOWER:]]), GIMT_Encode2([[#UPPER:]]), /*)*//*default:*//*Label 6*/ GIMT_Encode4([[#DEFAULT:]]), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(470), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(506), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(553), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(587), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(610), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(622), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(474), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(510), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(557), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(591), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(614), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(626), // CHECK-NEXT: // Label 0: @[[#%u, mul(UPPER-LOWER, 4) + 10]] -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(494), // Rule ID 4 // +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(498), // Rule ID 4 // // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything), // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a @@ -156,8 +156,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1, // CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner2), -// CHECK-NEXT: // Label 7: @494 -// CHECK-NEXT: GIM_Try, /*On fail 
goto*//*Label 8*/ GIMT_Encode4(505), // Rule ID 3 // +// CHECK-NEXT: // Label 7: @498 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(509), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -165,10 +165,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // Combiner Rule #2: InstTest0 // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1), -// CHECK-NEXT: // Label 8: @505 +// CHECK-NEXT: // Label 8: @509 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @506 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(552), // Rule ID 6 // +// CHECK-NEXT: // Label 1: @510 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(556), // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled), // CHECK-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst @@ -185,10 +185,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 9: @552 +// CHECK-NEXT: // Label 9: @556 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @553 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(586), // Rule ID 5 // +// CHECK-NEXT: // Label 2: @557 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(590), // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), // CHECK-NEXT: // MIs[0] tmp // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] @@ -204,29 +204,29 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // ptr // 
CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1, // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 10: @586 +// CHECK-NEXT: // Label 10: @590 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @587 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(598), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @591 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(602), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 11: @598 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(609), // Rule ID 1 // +// CHECK-NEXT: // Label 11: @602 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(613), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 12: @609 +// CHECK-NEXT: // Label 12: @613 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @610 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(621), // Rule ID 2 // +// CHECK-NEXT: // Label 4: @614 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(625), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 13: @621 +// CHECK-NEXT: // Label 13: @625 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 5: @622 -// CHECK-NEXT: GIM_Try, /*On fail 
goto*//*Label 14*/ GIMT_Encode4(656), // Rule ID 7 // +// CHECK-NEXT: // Label 5: @626 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(660), // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -240,7 +240,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 14: @656 +// CHECK-NEXT: // Label 14: @660 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: // Label 6: @[[#%u, DEFAULT]] // CHECK-NEXT: GIM_Reject, diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll new file mode 100644 index 00000000000000..064d3f29dd9ebb --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -passes='require,function(codegenprepare)' -S -mtriple=x86_64 | FileCheck %s --implicit-check-not="llvm.fake.use" +; +; When performing return duplication to enable +; tail call optimization we clone fake uses that exist in the to-be-eliminated +; return block into the predecessor blocks. When doing this with fake uses +; of PHI-nodes, they cannot be easily copied, but require the correct operand. +; We are currently not able to do this correctly, so we suppress the cloning +; of such fake uses at the moment. +; +; There should be no fake use of a call result in any of the resulting return +; blocks. + +; Fake uses of `this` should be duplicated into both return blocks. 
+; CHECK: if.then: +; CHECK: @llvm.fake.use({{.*}}this +; CHECK: if.else: +; CHECK: @llvm.fake.use({{.*}}this + +; CHECK: declare void @llvm.fake.use + +source_filename = "test.ll" + +%class.a = type { i8 } + +declare i32 @foo(ptr nonnull dereferenceable(1)) local_unnamed_addr +declare i32 @bar(ptr nonnull dereferenceable(1)) local_unnamed_addr + +define hidden void @func(ptr nonnull dereferenceable(1) %this) local_unnamed_addr align 2 optdebug { +entry: + %b = getelementptr inbounds %class.a, ptr %this, i64 0, i32 0 + %0 = load i8, i8* %b, align 1 + %tobool.not = icmp eq i8 %0, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %call = tail call i32 @foo(ptr nonnull dereferenceable(1) %this) + %call2 = tail call i32 @bar(ptr nonnull dereferenceable(1) %this) + br label %if.end + +if.else: ; preds = %entry + %call4 = tail call i32 @bar(ptr nonnull dereferenceable(1) %this) + %call5 = tail call i32 @foo(ptr nonnull dereferenceable(1) %this) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %call4.sink = phi i32 [ %call4, %if.else ], [ %call, %if.then ] + notail call void (...) @llvm.fake.use(i32 %call4.sink) + notail call void (...) @llvm.fake.use(ptr nonnull %this) + ret void +} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll new file mode 100644 index 00000000000000..b2cf89f6f2dd82 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll @@ -0,0 +1,37 @@ +; RUN: opt -mtriple=x86_64-unknown-unknown -S -codegenprepare <%s -o - | FileCheck %s +; +; Ensure return instruction splitting ignores fake uses. 
+; +; IR Generated with clang -O2 -S -emit-llvm -fextend-lifetimes test.cpp +; +;// test.cpp +;extern int bar(int); +; +;int foo2(int i) +;{ +; --i; +; if (i <= 0) +; return -1; +; return bar(i); +;} + +declare i32 @_Z3bari(i32) local_unnamed_addr + +define i32 @_Z4foo2i(i32 %i) local_unnamed_addr optdebug { +entry: + %dec = add nsw i32 %i, -1 + %cmp = icmp slt i32 %i, 2 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %call = tail call i32 @_Z3bari(i32 %dec) +; CHECK: ret i32 %call + br label %cleanup + +cleanup: ; preds = %entry, %if.end +; CHECK: cleanup: + %retval.0 = phi i32 [ %call, %if.end ], [ -1, %entry ] + tail call void (...) @llvm.fake.use(i32 %dec) +; CHECK: ret i32 -1 + ret i32 %retval.0 +} diff --git a/llvm/test/Transforms/GVN/fake-use-constprop.ll b/llvm/test/Transforms/GVN/fake-use-constprop.ll new file mode 100644 index 00000000000000..1466f9f9fca277 --- /dev/null +++ b/llvm/test/Transforms/GVN/fake-use-constprop.ll @@ -0,0 +1,60 @@ +; RUN: opt -passes=gvn -S < %s | FileCheck %s +; +; The Global Value Numbering pass (GVN) propagates boolean values +; that are constant in dominated basic blocks to all the uses +; in these basic blocks. However, we don't want the constant propagated +; into fake.use intrinsics since this would render the intrinsic useless +; with respect to keeping the variable live up until the fake.use. +; This test checks that we don't generate any fake.uses with constant 0. 
+; +; Reduced from the following test case, generated with clang -O2 -S -emit-llvm -fextend-lifetimes test.c +; +; extern void func1(); +; extern int bar(); +; extern void baz(int); +; +; int foo(int i, float f, int *punused) +; { +; int j = 3*i; +; if (j > 0) { +; int m = bar(i); +; if (m) { +; char b = f; +; baz(b); +; if (b) +; goto lab; +; func1(); +; } +; lab: +; func1(); +; } +; return 1; +; } + +;; GVN should propagate a constant value through to a regular call, but not to +;; a fake use, which should continue to track the original value. +; CHECK: %[[CONV_VAR:[a-zA-Z0-9]+]] = fptosi +; CHECK: call {{.+}} @bees(i8 0) +; CHECK: call {{.+}} @llvm.fake.use(i8 %[[CONV_VAR]]) + +define i32 @foo(float %f) optdebug { + %conv = fptosi float %f to i8 + %tobool3 = icmp eq i8 %conv, 0 + br i1 %tobool3, label %if.end, label %lab + +if.end: + tail call void (...) @bees(i8 %conv) + tail call void (...) @llvm.fake.use(i8 %conv) + br label %lab + +lab: + ret i32 1 +} + +declare i32 @bar(...) + +declare void @baz(i32) + +declare void @bees(i32) + +declare void @func1(...) diff --git a/llvm/test/Transforms/SROA/fake-use-escape.ll b/llvm/test/Transforms/SROA/fake-use-escape.ll new file mode 100644 index 00000000000000..5429d09740e522 --- /dev/null +++ b/llvm/test/Transforms/SROA/fake-use-escape.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=sroa %s | FileCheck %s +; +;; Check that we do not assert and that we retain the fake_use instruction that +;; uses the address of bar. +; +; CHECK: define{{.*}}foo +; CHECK: call{{.*llvm\.fake\.use.*}}(ptr %bar.addr) + +define void @_Z3fooPi(ptr %bar) { +entry: + %bar.addr = alloca ptr, align 8 + %baz = alloca ptr, align 8 + store ptr %bar, ptr %bar.addr, align 8 + store ptr %bar.addr, ptr %baz, align 8 + %0 = load ptr, ptr %bar.addr, align 8 + %1 = load ptr, ptr %baz, align 8 + call void (...) @llvm.fake.use(ptr %1) + ret void +} + +declare void @llvm.fake.use(...) 
diff --git a/llvm/test/Transforms/SROA/fake-use-sroa.ll b/llvm/test/Transforms/SROA/fake-use-sroa.ll new file mode 100644 index 00000000000000..9e92df15487506 --- /dev/null +++ b/llvm/test/Transforms/SROA/fake-use-sroa.ll @@ -0,0 +1,52 @@ +; RUN: opt -S -passes=sroa %s | FileCheck %s +; With fake use instrinsics generated for small aggregates, check that when +; SROA slices the aggregate, we generate individual fake use intrinsics for +; the individual values. + +; Generated from the following source: +; struct s { +; int i; +; int j; +; }; +; +; void foo(struct s S) { +; } +; +; void bar() { +; int arr[2] = {5, 6}; +; } +; +%struct.s = type { i32, i32 } +@__const.bar.arr = private unnamed_addr constant [2 x i32] [i32 5, i32 6], align 4 + +; A small struct passed as parameter +; CHECK-LABEL: define{{.*}}foo +; CHECK: %[[SLICE1:[^ ]+]] = trunc i64 +; CHECK: %[[SLICE2:[^ ]+]] = trunc i64 +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[SLICE1]]) +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[SLICE2]]) +define dso_local void @foo(i64 %S.coerce) optdebug { +entry: + %S = alloca %struct.s, align 4 + store i64 %S.coerce, ptr %S, align 4 + %fake.use = load %struct.s, ptr %S, align 4 + notail call void (...) @llvm.fake.use(%struct.s %fake.use) + ret void +} + +; A local variable with a small array type. +; CHECK-LABEL: define{{.*}}bar +; CHECK: %[[ARRAYSLICE1:[^ ]+]] = load +; CHECK: %[[ARRAYSLICE2:[^ ]+]] = load +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[ARRAYSLICE1]]) +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[ARRAYSLICE2]]) +define dso_local void @bar() optdebug { +entry: + %arr = alloca [2 x i32], align 4 + call void @llvm.memcpy.p0i8.p0i8.i64(ptr align 4 %arr, ptr align 4 bitcast (ptr @__const.bar.arr to ptr), i64 8, i1 false) + %fake.use = load [2 x i32], ptr %arr, align 4 + notail call void (...) 
@llvm.fake.use([2 x i32] %fake.use) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index bbd1f9af65095a..3a4c7ea03b3aa9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -201,6 +201,7 @@ static_library("CodeGen") { "RegisterPressure.cpp", "RegisterScavenging.cpp", "RegisterUsageInfo.cpp", + "RemoveLoadsIntoFakeUses.cpp", "RemoveRedundantDebugValues.cpp", "RenameIndependentSubregs.cpp", "ReplaceWithVeclib.cpp", From 74b4ec17e24a256b4aae5e53b855ba429af685bf Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 09:57:58 -0700 Subject: [PATCH 19/72] [VP] Remove VP_PROPERTY_REDUCTION and VP_PROPERTY_CMP [nfc] (#105551) These lists are quite static and several of the parameters are actually constant across all users. Heavy use of macros is undesirable, and not idiomatic in LLVM, so let's just use the naive switch cases. I'll probably continue with removing the other property macros. These two just happened to be the two I actually had to figure out for an unrelated change. --- llvm/include/llvm/IR/VPIntrinsics.def | 19 ----- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 25 +++++-- llvm/lib/IR/IntrinsicInst.cpp | 73 ++++++++----------- 3 files changed, 48 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 521cbc2dc278f9..3fad00e2caf21f 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -129,11 +129,6 @@ #define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) #endif -// Map this VP reduction intrinsic to its reduction operand positions. 
-#ifndef VP_PROPERTY_REDUCTION -#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) -#endif - // A property to infer VP binary-op SDNode opcodes automatically. #ifndef VP_PROPERTY_BINARYOP #define VP_PROPERTY_BINARYOP @@ -144,13 +139,6 @@ #define VP_PROPERTY_CASTOP #endif -// This VP Intrinsic is a comparison operation -// The condition code arg is at CCPOS and accepts floating-point condition -// codes if ISFP is set, else it accepts integer condition codes. -#ifndef VP_PROPERTY_CMP -#define VP_PROPERTY_CMP(CCPOS, ISFP) -#endif - /// } Property Macros ///// Integer Arithmetic { @@ -567,7 +555,6 @@ END_REGISTER_VP_SDNODE(VP_SETCC) BEGIN_REGISTER_VP_INTRINSIC(vp_fcmp, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_fcmp, VP_SETCC) VP_PROPERTY_FUNCTIONAL_OPC(FCmp) -VP_PROPERTY_CMP(2, true) VP_PROPERTY_CONSTRAINEDFP(0, 1, experimental_constrained_fcmp) END_REGISTER_VP_INTRINSIC(vp_fcmp) @@ -575,7 +562,6 @@ END_REGISTER_VP_INTRINSIC(vp_fcmp) BEGIN_REGISTER_VP_INTRINSIC(vp_icmp, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_icmp, VP_SETCC) VP_PROPERTY_FUNCTIONAL_OPC(ICmp) -VP_PROPERTY_CMP(2, false) END_REGISTER_VP_INTRINSIC(vp_icmp) ///// } Comparisons @@ -655,7 +641,6 @@ END_REGISTER_VP(vp_gather, VP_GATHER) BEGIN_REGISTER_VP(VPID, 2, 3, VPSD, 1) \ VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ - VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP(VPID, VPSD) // llvm.vp.reduce.add(start,x,mask,vlen) @@ -725,13 +710,11 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM, #define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, SDOPC, SEQ_SDOPC, INTRIN) \ BEGIN_REGISTER_VP_INTRINSIC(VPID, 2, 3) \ BEGIN_REGISTER_VP_SDNODE(VPSD, 1, VPID, 2, 3) \ - VP_PROPERTY_REDUCTION(0, 1) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ END_REGISTER_VP_SDNODE(VPSD) \ BEGIN_REGISTER_VP_SDNODE(SEQ_VPSD, 1, VPID, 2, 3) \ HELPER_MAP_VPID_TO_VPSD(VPID, SEQ_VPSD) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SEQ_SDOPC) \ - VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP_SDNODE(SEQ_VPSD) \ 
VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ END_REGISTER_VP_INTRINSIC(VPID) @@ -796,11 +779,9 @@ END_REGISTER_VP(experimental_vp_splat, EXPERIMENTAL_VP_SPLAT) #undef HELPER_MAP_VPID_TO_VPSD #undef VP_PROPERTY_BINARYOP #undef VP_PROPERTY_CASTOP -#undef VP_PROPERTY_CMP #undef VP_PROPERTY_CONSTRAINEDFP #undef VP_PROPERTY_FUNCTIONAL_INTRINSIC #undef VP_PROPERTY_FUNCTIONAL_OPC #undef VP_PROPERTY_FUNCTIONAL_SDOPC #undef VP_PROPERTY_NO_FUNCTIONAL #undef VP_PROPERTY_MEMOP -#undef VP_PROPERTY_REDUCTION diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9c7a43064ecf62..9efcd3f25797b5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -503,13 +503,26 @@ bool ISD::isVPBinaryOp(unsigned Opcode) { bool ISD::isVPReduction(unsigned Opcode) { switch (Opcode) { default: - break; -#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD: -#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true; -#define END_REGISTER_VP_SDNODE(VPSD) break; -#include "llvm/IR/VPIntrinsics.def" + return false; + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + case ISD::VP_REDUCE_FMAX: + case ISD::VP_REDUCE_FMIN: + case ISD::VP_REDUCE_FMAXIMUM: + case ISD::VP_REDUCE_FMINIMUM: + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_FMUL: + case ISD::VP_REDUCE_SEQ_FADD: + case ISD::VP_REDUCE_SEQ_FMUL: + return true; } - return false; } /// The operand position of the vector mask. 
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 7680fd02562316..966fa62abd94fe 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -738,14 +738,25 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { switch (ID) { + case Intrinsic::vp_reduce_add: + case Intrinsic::vp_reduce_mul: + case Intrinsic::vp_reduce_and: + case Intrinsic::vp_reduce_or: + case Intrinsic::vp_reduce_xor: + case Intrinsic::vp_reduce_smax: + case Intrinsic::vp_reduce_smin: + case Intrinsic::vp_reduce_umax: + case Intrinsic::vp_reduce_umin: + case Intrinsic::vp_reduce_fmax: + case Intrinsic::vp_reduce_fmin: + case Intrinsic::vp_reduce_fmaximum: + case Intrinsic::vp_reduce_fminimum: + case Intrinsic::vp_reduce_fadd: + case Intrinsic::vp_reduce_fmul: + return true; default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return false; } - return false; } bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { @@ -763,13 +774,11 @@ bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { bool VPCmpIntrinsic::isVPCmp(Intrinsic::ID ID) { switch (ID) { default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_CMP(CCPOS, ...) 
return true; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return false; + case Intrinsic::vp_fcmp: + case Intrinsic::vp_icmp: + return true; } - return false; } bool VPBinOpIntrinsic::isVPBinOp(Intrinsic::ID ID) { @@ -803,22 +812,10 @@ static ICmpInst::Predicate getIntPredicateFromMD(const Value *Op) { } CmpInst::Predicate VPCmpIntrinsic::getPredicate() const { - bool IsFP = true; - std::optional CCArgIdx; - switch (getIntrinsicID()) { - default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_CMP(CCPOS, ISFP) \ - CCArgIdx = CCPOS; \ - IsFP = ISFP; \ - break; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - } - assert(CCArgIdx && "Unexpected vector-predicated comparison"); - return IsFP ? getFPPredicateFromMD(getArgOperand(*CCArgIdx)) - : getIntPredicateFromMD(getArgOperand(*CCArgIdx)); + assert(isVPCmp(getIntrinsicID())); + return getIntrinsicID() == Intrinsic::vp_fcmp + ? getFPPredicateFromMD(getArgOperand(2)) + : getIntPredicateFromMD(getArgOperand(2)); } unsigned VPReductionIntrinsic::getVectorParamPos() const { @@ -831,27 +828,15 @@ unsigned VPReductionIntrinsic::getStartParamPos() const { std::optional VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { - switch (ID) { -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return VECTORPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - default: - break; - } + if (isVPReduction(ID)) + return 1; return std::nullopt; } std::optional VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { - switch (ID) { -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) 
case Intrinsic::VPID: -#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return STARTPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - default: - break; - } + if (isVPReduction(ID)) + return 0; return std::nullopt; } From eed135fea72b544426349e6461a0ca142c27967e Mon Sep 17 00:00:00 2001 From: Thomas Preud'homme Date: Thu, 29 Aug 2024 17:56:49 +0100 Subject: [PATCH 20/72] Revert "[Analysis] Guard logf128 cst folding" This reverts commit 42d3cccffd203ff6dc967d4243588ca466c0faf7 which caused a test failure. --- llvm/lib/Analysis/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 99ce0acdabe1ba..393803fad89383 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -163,6 +163,8 @@ add_llvm_component_library(LLVMAnalysis TargetParser ) -if(LLVM_HAS_LOGF128) - target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) +if(HAS_LOGF128) + target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) endif() From 178392454e076624674b4a7ddf3fc8bda2e94f0e Mon Sep 17 00:00:00 2001 From: Harini0924 Date: Thu, 29 Aug 2024 10:00:58 -0700 Subject: [PATCH 21/72] [llvm-lit] Print environment variables when using env without subcommand (#98414) This patch addresses an issue with lit's internal shell when env is without any arguments, it fails with exit code 127 because `env` requires a subcommand. This patch addresses the issue by encoding the command to properly return environment variables even when no arguments are provided. The error occurred when running the command ` LIT_USE_INTERNAL_SHELL=1 ninja check-llvm`. 
fixes: #102383 This is part of the test cleanups proposed in the RFC: [[RFC] Enabling the Lit Internal Shell by Default](https://discourse.llvm.org/t/rfc-enabling-the-lit-internal-shell-by-default/80179) --- llvm/utils/lit/lit/TestRunner.py | 11 +- .../env-calls-cd.txt | 0 .../env-calls-colon.txt | 0 .../env-calls-echo.txt | 0 .../env-calls-export.txt | 0 .../env-calls-mkdir.txt | 0 .../env-calls-not-builtin.txt | 0 .../env-calls-rm.txt | 0 .../lit.cfg | 1 + .../env-args-last-is-assign.txt | 0 .../env-args-last-is-u-arg.txt | 0 .../env-args-last-is-u.txt | 0 .../env-args-nested-none.txt | 0 .../shtest-env-positive/env-calls-env.txt | 32 +++++ .../shtest-env-positive/env-no-subcommand.txt | 37 +++++ .../Inputs/shtest-env-positive/env-u.txt | 22 +++ .../tests/Inputs/shtest-env-positive/env.txt | 15 +++ .../tests/Inputs/shtest-env-positive/lit.cfg | 11 ++ .../Inputs/shtest-env-positive/mixed.txt | 18 +++ .../tests/Inputs/shtest-env/env-args-none.txt | 1 - .../tests/Inputs/shtest-env/env-calls-env.txt | 32 ----- .../lit/tests/Inputs/shtest-env/env-u.txt | 23 ---- .../utils/lit/tests/Inputs/shtest-env/env.txt | 15 --- .../lit/tests/Inputs/shtest-env/mixed.txt | 18 --- .../Inputs/shtest-env/print_environment.py | 9 -- llvm/utils/lit/tests/shtest-env-negative.py | 49 +++++++ llvm/utils/lit/tests/shtest-env-positive.py | 70 ++++++++++ llvm/utils/lit/tests/shtest-env.py | 126 ------------------ 28 files changed, 265 insertions(+), 225 deletions(-) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-cd.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-colon.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-echo.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-export.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-mkdir.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => 
shtest-env-negative}/env-calls-not-builtin.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-rm.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/lit.cfg (90%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-last-is-assign.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-last-is-u-arg.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-last-is-u.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-nested-none.txt (100%) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py create mode 100644 llvm/utils/lit/tests/shtest-env-negative.py create mode 100644 llvm/utils/lit/tests/shtest-env-positive.py delete mode 100644 llvm/utils/lit/tests/shtest-env.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 223a6c6e4ca0a2..19f35fc7e212f3 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -742,7 +742,16 @@ def _executeShCmd(cmd, shenv, 
results, timeoutHelper): cmd_shenv = ShellEnvironment(shenv.cwd, shenv.env) args = updateEnv(cmd_shenv, args) if not args: - raise InternalShellError(j, "Error: 'env' requires a" " subcommand") + # Return the environment variables if no argument is provided. + env_str = "\n".join( + f"{key}={value}" for key, value in sorted(cmd_shenv.env.items()) + ) + results.append( + ShellCommandResult( + j, env_str, "", 0, timeoutHelper.timeoutReached(), [] + ) + ) + return 0 elif args[0] == "not": not_args.append(args.pop(0)) not_count += 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-cd.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-cd.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-cd.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-cd.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-colon.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-colon.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-colon.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-colon.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-echo.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-echo.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-echo.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-echo.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-export.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-export.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-export.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-export.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-mkdir.txt similarity index 100% rename from 
llvm/utils/lit/tests/Inputs/shtest-env/env-calls-mkdir.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-mkdir.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-not-builtin.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-not-builtin.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-not-builtin.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-not-builtin.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-rm.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-rm.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-rm.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-rm.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-negative/lit.cfg similarity index 90% rename from llvm/utils/lit/tests/Inputs/shtest-env/lit.cfg rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/lit.cfg index df9df7da81daaf..626c00f71d7287 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-env/lit.cfg +++ b/llvm/utils/lit/tests/Inputs/shtest-env-negative/lit.cfg @@ -7,4 +7,5 @@ config.test_source_root = None config.test_exec_root = None config.environment["FOO"] = "1" config.environment["BAR"] = "2" +config.environment["QUX"] = "3" config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-assign.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-assign.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-assign.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-assign.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u-arg.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u-arg.txt similarity index 100% rename from 
llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u-arg.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u-arg.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-nested-none.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-nested-none.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-args-nested-none.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-nested-none.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt new file mode 100644 index 00000000000000..ee40c60a1e4b65 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt @@ -0,0 +1,32 @@ +## Tests the behaviour of chaining env commands together. + +## Check that internal env can call internal env. +# RUN: env env | FileCheck -check-prefix=CHECK-2-EMPTY-ARGS %s +# +# CHECK-2-EMPTY-ARGS: BAR = 2 +# CHECK-2-EMPTY-ARGS: FOO = 1 + +## Check setting variables in a nested env call. +# RUN: env FOO=2 env BAR=1 | FileCheck -check-prefix=CHECK-2-VAL %s +# +# CHECK-2-VAL: BAR = 1 +# CHECK-2-VAL: FOO = 2 + +## Check unsetting variables in a nested env call. +# RUN: env -u FOO env -u BAR | FileCheck -check-prefix=CHECK-2-U %s +# +# CHECK-2-U-NOT: BAR +# CHECK-2-U-NOT: FOO + +## Check mixed setting and unsetting in nested env calls. +# RUN: env -u FOO BAR=1 env -u BAR FOO=2 | FileCheck -check-prefix=CHECK-2-U-VAL %s +# +# CHECK-2-U-VAL-NOT: BAR +# CHECK-2-U-VAL: FOO = 2 + +## Check setting, unsetting, and adding a new variable in nested env calls. 
+# RUN: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 | FileCheck -check-prefix=CHECK-3 %s +# +# CHECK-3-NOT: BAR +# CHECK-3: BAZ = 3 +# CHECK-3: FOO = 2 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt new file mode 100644 index 00000000000000..761a8061a0b0de --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt @@ -0,0 +1,37 @@ +## Tests the env command in various scenarios: without arguments, setting, unsetting, and mixing envrionment variables. + +## Check default environment. +# RUN: env | FileCheck -check-prefix=NO-ARGS %s +# +# NO-ARGS: BAR=2 +# NO-ARGS: FOO=1 +# NO-ARGS: QUX=3 + +## Set environment variables. +# RUN: env FOO=2 BAR=1 | FileCheck -check-prefix=SET-VAL %s +# +# SET-VAL: BAR=1 +# SET-VAL: FOO=2 +# SET-VAL: QUX=3 + +## Unset environment variables. +# RUN: env -u FOO -u BAR | FileCheck -check-prefix=UNSET-U %s +# +# UNSET-U-NOT: BAR +# UNSET-U-NOT: FOO +# UNSET-U: QUX=3 + +## Mixed set and unset environment variables. +# RUN: env -u FOO BAR=1 -u BAR FOO=2 | FileCheck -check-prefix=MIXED-SET-UNSET %s +# +# MIXED-SET-UNSET-NOT: BAR +# MIXED-SET-UNSET: FOO=2 +# MIXED-SET-UNSET: QUX=3 + +## Mixed set and unset with additional variable. +# RUN: env -u FOO BAR=1 -u BAR FOO=2 BAZ=4 | FileCheck -check-prefix=MIXED-SET-UNSET-ADD-3 %s +# +# MIXED-SET-UNSET-ADD-NOT: BAR +# MIXED-SET-UNSET-ADD: BAZ=4 +# MIXED-SET-UNSET-ADD: FOO=2 +# MIXED-SET-UNSET-ADD: QUX=3 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt new file mode 100644 index 00000000000000..2945639c0642df --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt @@ -0,0 +1,22 @@ +## Tests env command for preset variables and handling single/multiple unsets. + +## Check and make sure preset environment variable were set in lit.cfg. 
+# +# RUN: env | FileCheck --check-prefix=CHECK-ENV-PRESET %s +# +## Check single unset of environment variable. +# +# RUN: env -u FOO | FileCheck --check-prefix=CHECK-ENV-UNSET-1 %s +# +## Check multiple unsets of environment variables. +# +# RUN: env -u FOO -u BAR | FileCheck --check-prefix=CHECK-ENV-UNSET-MULTIPLE %s + +# CHECK-ENV-PRESET: BAR = 2 +# CHECK-ENV-PRESET: FOO = 1 + +# CHECK-ENV-UNSET-1: BAR = 2 +# CHECK-ENV-UNSET-1-NOT: FOO + +# CHECK-ENV-UNSET-MULTIPLE-NOT: BAR +# CHECK-ENV-UNSET-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt new file mode 100644 index 00000000000000..74a2a65d260f41 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt @@ -0,0 +1,15 @@ +## Tests env command for setting single and multiple environment variables. + +## Check for simple one environment variable setting. +# +# RUN: env A_FOO=999 | FileCheck --check-prefix=CHECK-ENV-1 %s +# +## Check for multiple environment variable settings. 
+# +# RUN: env A_FOO=1 B_BAR=2 C_OOF=3 | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s + +# CHECK-ENV-1: A_FOO = 999 + +# CHECK-ENV-MULTIPLE: A_FOO = 1 +# CHECK-ENV-MULTIPLE: B_BAR = 2 +# CHECK-ENV-MULTIPLE: C_OOF = 3 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg new file mode 100644 index 00000000000000..626c00f71d7287 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg @@ -0,0 +1,11 @@ +import lit.formats + +config.name = "shtest-env" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None +config.environment["FOO"] = "1" +config.environment["BAR"] = "2" +config.environment["QUX"] = "3" +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt new file mode 100644 index 00000000000000..c2c4e8bfdfc8bd --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt @@ -0,0 +1,18 @@ +## Tests env command for setting and unsetting single and multiple environment variables. 
+ +## Check for setting and removing one environment variable +# +# RUN: env A_FOO=999 -u FOO | FileCheck --check-prefix=CHECK-ENV-1 %s +# +## Check for setting/unsetting multiple environment variables +# +# RUN: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s + +# CHECK-ENV-1: A_FOO = 999 +# CHECK-ENV-1-NOT: FOO + +# CHECK-ENV-MULTIPLE: A_FOO = 1 +# CHECK-ENV-MULTIPLE-NOT: BAR +# CHECK-ENV-MULTIPLE: B_BAR = 2 +# CHECK-ENV-MULTIPLE: C_OOF = 3 +# CHECK-ENV-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt deleted file mode 100644 index dc5cdbad09afc9..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt +++ /dev/null @@ -1 +0,0 @@ -# RUN: env diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt deleted file mode 100644 index 26150c413dc03d..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Check that internal env can call internal env. 
- -# RUN: env env %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-EMPTY-ARGS %s -# -# CHECK-2-EMPTY-ARGS: BAR = 2 -# CHECK-2-EMPTY-ARGS: FOO = 1 - -# RUN: env FOO=2 env BAR=1 %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-VAL %s -# -# CHECK-2-VAL: BAR = 1 -# CHECK-2-VAL: FOO = 2 - -# RUN: env -u FOO env -u BAR %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-U %s -# -# CHECK-2-U-NOT: BAR -# CHECK-2-U-NOT: FOO - -# RUN: env -u FOO BAR=1 env -u BAR FOO=2 %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-U-VAL %s -# -# CHECK-2-U-VAL-NOT: BAR -# CHECK-2-U-VAL: FOO = 2 - -# RUN: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-3 %s -# -# CHECK-3-NOT: BAR -# CHECK-3: BAZ = 3 -# CHECK-3: FOO = 2 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt deleted file mode 100644 index 9cdf9d08850f78..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Check and make sure preset environment variable were set in lit.cfg -# -# RUN: %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-PRESET %s -# -# Check single unset of environment variable -# -# RUN: env -u FOO %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-UNSET-1 %s -# -# Check multiple unsets of environment variables -# -# RUN: env -u FOO -u BAR %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-UNSET-MULTIPLE %s - -# CHECK-ENV-PRESET: BAR = 2 -# CHECK-ENV-PRESET: FOO = 1 - -# CHECK-ENV-UNSET-1: BAR = 2 -# CHECK-ENV-UNSET-1-NOT: FOO - -# CHECK-ENV-UNSET-MULTIPLE-NOT: BAR -# CHECK-ENV-UNSET-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env.txt deleted file mode 100644 index 
aa697b0c4081f4..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Check for simple one environment variable setting -# -# RUN: env A_FOO=999 %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-1 %s -# -# Check for multiple environment variable settings -# -# RUN: env A_FOO=1 B_BAR=2 C_OOF=3 %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s - -# CHECK-ENV-1: A_FOO = 999 - -# CHECK-ENV-MULTIPLE: A_FOO = 1 -# CHECK-ENV-MULTIPLE: B_BAR = 2 -# CHECK-ENV-MULTIPLE: C_OOF = 3 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt b/llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt deleted file mode 100644 index be32d458843bc3..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt +++ /dev/null @@ -1,18 +0,0 @@ -# Check for setting and removing one environment variable -# -# RUN: env A_FOO=999 -u FOO %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-1 %s -# -# Check for setting/unsetting multiple environment variables -# -# RUN: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s - -# CHECK-ENV-1: A_FOO = 999 -# CHECK-ENV-1-NOT: FOO - -# CHECK-ENV-MULTIPLE: A_FOO = 1 -# CHECK-ENV-MULTIPLE-NOT: BAR -# CHECK-ENV-MULTIPLE: B_BAR = 2 -# CHECK-ENV-MULTIPLE: C_OOF = 3 -# CHECK-ENV-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py b/llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py deleted file mode 100644 index e39bd73e44a108..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function -import os - -sorted_environment = sorted(os.environ.items()) - -for name, value in sorted_environment: - print(name, "=", value) diff --git a/llvm/utils/lit/tests/shtest-env-negative.py 
b/llvm/utils/lit/tests/shtest-env-negative.py new file mode 100644 index 00000000000000..c8b59b224e7c43 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-env-negative.py @@ -0,0 +1,49 @@ +## Test the env command (failing tests). + +# RUN: not %{lit} -a -v %{inputs}/shtest-env-negative \ +# RUN: | FileCheck -match-full-lines %s +# +# END. + +## Test the env command's expected failures. + +# CHECK: -- Testing: 7 tests{{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-cd.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 cd foobar +# CHECK: # executed command: env -u FOO BAR=3 cd foobar +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-colon.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 : +# CHECK: # executed command: env -u FOO BAR=3 : +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-echo.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 echo hello world +# CHECK: # executed command: env -u FOO BAR=3 echo hello world +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-export.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 export BAZ=3 +# CHECK: # executed command: env -u FOO BAR=3 export BAZ=3 +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-mkdir.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 mkdir foobar +# CHECK: # executed command: env -u FOO BAR=3 mkdir foobar +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-not-builtin.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 not rm {{.+}}.no-such-file +# CHECK: # executed command: env -u FOO BAR=3 not rm {{.+}}.no-such-file{{.*}} +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-rm.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 rm foobar +# CHECK: # executed command: env -u FOO BAR=3 rm foobar +# CHECK: # error: command failed with exit status: {{.*}} + +# 
CHECK: Total Discovered Tests: 7 +# CHECK: Failed: 7 {{\([0-9]*\.[0-9]*%\)}} +# CHECK-NOT: {{.}} diff --git a/llvm/utils/lit/tests/shtest-env-positive.py b/llvm/utils/lit/tests/shtest-env-positive.py new file mode 100644 index 00000000000000..863fbda8c5b6dc --- /dev/null +++ b/llvm/utils/lit/tests/shtest-env-positive.py @@ -0,0 +1,70 @@ +## Test the env command (passing tests). + +# RUN: %{lit} -a -v %{inputs}/shtest-env-positive \ +# RUN: | FileCheck -match-full-lines %s +# +# END. + +## Test the env command's successful executions. + +# CHECK: -- Testing: 9 tests{{.*}} + +# CHECK: PASS: shtest-env :: env-args-last-is-assign.txt ({{[^)]*}}) +# CHECK: env FOO=1 +# CHECK: # executed command: env FOO=1 +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-args-last-is-u-arg.txt ({{[^)]*}}) +# CHECK: env -u FOO +# CHECK: # executed command: env -u FOO +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-args-last-is-u.txt ({{[^)]*}}) +# CHECK: env -u +# CHECK: # executed command: env -u +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-args-nested-none.txt ({{[^)]*}}) +# CHECK: env env env +# CHECK: # executed command: env env env +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-calls-env.txt ({{[^)]*}}) +# CHECK: env env | {{.*}} +# CHECK: # executed command: env env +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-no-subcommand.txt ({{[^)]*}}) +# CHECK: env | {{.*}} +# CHECK: # executed command: env +# CHECK: env FOO=2 BAR=1 | {{.*}} +# CHECK: # executed command: env FOO=2 BAR=1 +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-u.txt ({{[^)]*}}) +# CHECK: env -u FOO | {{.*}} +# CHECK: # executed command: env -u FOO +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env.txt ({{[^)]*}}) +# CHECK: env A_FOO=999 | {{.*}} +# CHECK: # executed command: env A_FOO=999 +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env 
:: mixed.txt ({{[^)]*}}) +# CHECK: env A_FOO=999 -u FOO | {{.*}} +# CHECK: # executed command: env A_FOO=999 -u FOO +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: Total Discovered Tests: 9 +# CHECK: Passed: 9 {{\([0-9]*\.[0-9]*%\)}} +# CHECK-NOT: {{.}} diff --git a/llvm/utils/lit/tests/shtest-env.py b/llvm/utils/lit/tests/shtest-env.py deleted file mode 100644 index 03bb4a3cae7dd1..00000000000000 --- a/llvm/utils/lit/tests/shtest-env.py +++ /dev/null @@ -1,126 +0,0 @@ -# Check the env command - -# RUN: not %{lit} -a -v %{inputs}/shtest-env \ -# RUN: | FileCheck -match-full-lines %s -# -# END. - -# Make sure env commands are included in printed commands. - -# CHECK: -- Testing: 16 tests{{.*}} - -# CHECK: FAIL: shtest-env :: env-args-last-is-assign.txt ({{[^)]*}}) -# CHECK: env FOO=1 -# CHECK: # executed command: env FOO=1 -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-last-is-u-arg.txt ({{[^)]*}}) -# CHECK: env -u FOO -# CHECK: # executed command: env -u FOO -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-last-is-u.txt ({{[^)]*}}) -# CHECK: env -u -# CHECK: # executed command: env -u -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-nested-none.txt ({{[^)]*}}) -# CHECK: env env env -# CHECK: # executed command: env env env -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-none.txt ({{[^)]*}}) -# CHECK: env -# CHECK: # executed command: env -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-cd.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 cd foobar -# CHECK: # executed command: 
env -u FOO BAR=3 cd foobar -# CHECK: # | Error: 'env' cannot call 'cd' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-colon.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 : -# CHECK: # executed command: env -u FOO BAR=3 : -# CHECK: # | Error: 'env' cannot call ':' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-echo.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 echo hello world -# CHECK: # executed command: env -u FOO BAR=3 echo hello world -# CHECK: # | Error: 'env' cannot call 'echo' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: PASS: shtest-env :: env-calls-env.txt ({{[^)]*}}) -# CHECK: env env [[PYTHON:.+]] print_environment.py | {{.*}} -# CHECK: # executed command: env env [[PYTHON_BARE:.+]] print_environment.py -# CHECK: env FOO=2 env BAR=1 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env FOO=2 env BAR=1 [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO env -u BAR [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO env -u BAR [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO BAR=1 env -u BAR FOO=2 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO BAR=1 env -u BAR FOO=2 [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# CHECK: -- - -# CHECK: FAIL: shtest-env :: env-calls-export.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 export BAZ=3 -# CHECK: # executed command: env -u FOO BAR=3 export BAZ=3 -# CHECK: # | Error: 'env' cannot call 'export' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-mkdir.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 mkdir foobar -# CHECK: # executed 
command: env -u FOO BAR=3 mkdir foobar -# CHECK: # | Error: 'env' cannot call 'mkdir' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-not-builtin.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 not rm {{.+}}.no-such-file -# CHECK: # executed command: env -u FOO BAR=3 not rm {{.+}}.no-such-file{{.*}} -# CHECK: # | Error: 'env' cannot call 'rm' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-rm.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 rm foobar -# CHECK: # executed command: env -u FOO BAR=3 rm foobar -# CHECK: # | Error: 'env' cannot call 'rm' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: PASS: shtest-env :: env-u.txt ({{[^)]*}}) -# CHECK: [[PYTHON]] print_environment.py | {{.*}} -# CHECK: env -u FOO [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO -u BAR [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO -u BAR [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# CHECK: -- - -# CHECK: PASS: shtest-env :: env.txt ({{[^)]*}}) -# CHECK: env A_FOO=999 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=999 [[PYTHON_BARE]] print_environment.py -# CHECK: env A_FOO=1 B_BAR=2 C_OOF=3 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=1 B_BAR=2 C_OOF=3 [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# CHECK: -- - -# CHECK: PASS: shtest-env :: mixed.txt ({{[^)]*}}) -# CHECK: env A_FOO=999 -u FOO [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=999 -u FOO [[PYTHON_BARE]] print_environment.py -# CHECK: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# 
CHECK: -- - -# CHECK: Total Discovered Tests: 16 -# CHECK: Passed: 4 {{\([0-9]*\.[0-9]*%\)}} -# CHECK: Failed: 12 {{\([0-9]*\.[0-9]*%\)}} -# CHECK-NOT: {{.}} From 2dc3b509879518340b991733bfde5c7a4becd559 Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Thu, 29 Aug 2024 11:01:52 -0600 Subject: [PATCH 22/72] [HLSL] Apply NoRecurse attrib to all HLSL functions (#105907) Previously, functions named "main" got the NoRecurse attribute consistent with the behavior of C++, which HLSL largely follows. However, standard recursion is not allowed in HLSL, so all functions should really have this attribute. This doesn't prevent recursion, but rather signals that these functions aren't expected to recurse. Practically, this was done so that entry point functions named "main" would have all have the same attributes as otherwise identical entry points with other names. This required small changes to the this assignment tests because they no longer generate so many attribute sets since more of them match. related to #105244 but done to simplify testing for #89806 --- clang/lib/CodeGen/CodeGenFunction.cpp | 10 +- .../implicit-norecurse-attrib.hlsl | 93 +++++++++++++++++++ .../CodeGenHLSL/this-assignment-overload.hlsl | 4 +- clang/test/CodeGenHLSL/this-assignment.hlsl | 4 +- 4 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index c89eaa0f4e3bfc..a5747283e98058 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1064,13 +1064,17 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, // OpenCL C 2.0 v2.2-11 s6.9.i: // Recursion is not supported. // + // HLSL + // Recursion is not supported. 
+ // // SYCL v1.2.1 s3.10: // kernels cannot include RTTI information, exception classes, // recursive code, virtual functions or make use of C++ libraries that // are not compiled for the device. - if (FD && ((getLangOpts().CPlusPlus && FD->isMain()) || - getLangOpts().OpenCL || getLangOpts().SYCLIsDevice || - (getLangOpts().CUDA && FD->hasAttr()))) + if (FD && + ((getLangOpts().CPlusPlus && FD->isMain()) || getLangOpts().OpenCL || + getLangOpts().HLSL || getLangOpts().SYCLIsDevice || + (getLangOpts().CUDA && FD->hasAttr()))) Fn->addFnAttr(llvm::Attribute::NoRecurse); llvm::RoundingMode RM = getLangOpts().getDefaultRoundingMode(); diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl new file mode 100644 index 00000000000000..ae3a3b5f90199f --- /dev/null +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -0,0 +1,93 @@ +// RUN: %clang_cc1 -x hlsl -triple dxil-pc-shadermodel6.3-library -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -x hlsl -triple dxil-pc-shadermodel6.0-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// Verify that a few different function types all get the NoRecurse attribute + +#define MAX 100 + +struct Node { + uint value; + uint key; + uint left, right; +}; + +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define noundef i32 @"?Find@@YAIY0GE@UNode@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] +// CHECK: ret i32 +// Find and return value corresponding to key in the SortedTree +uint Find(Node SortedTree[MAX], uint key) { + uint nix = 0; // head + while(true) { + if (nix < 0) + return 0.0; // Not found + Node n = SortedTree[nix]; + if (n.key == key) + return n.value; + if (key < n.key) + nix = n.left; + else + nix = n.right; + } +} + +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define noundef i1 
@"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] +// CHECK: ret i1 +// Initialize tree with given buffer +// Imagine the inout works +export +bool InitTree(/*inout*/ Node tree[MAX], RWBuffer encodedTree, uint maxDepth) { + uint size = pow(2.f, maxDepth) - 1; + if (size > MAX) return false; + for (uint i = 1; i < size; i++) { + tree[i].value = encodedTree[i].x; + tree[i].key = encodedTree[i].y; + tree[i].left = encodedTree[i].z; + tree[i].right = encodedTree[i].w; + } + return true; +} + +RWBuffer gTree; + +// Mangled entry points are internal +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define internal void @"?main@@YAXI@Z"(i32 noundef %GI) [[IntAttr]] +// CHECK: ret void + +// Canonical entry points are external and shader attributed +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define void @main() [[EntryAttr:\#[0-9]+]] +// CHECK: ret void + +[numthreads(1,1,1)] +[shader("compute")] +void main(uint GI : SV_GroupIndex) { + Node haystack[MAX]; + uint needle = 0; + if (InitTree(haystack, gTree, GI)) + needle = Find(haystack, needle); +} + +// Mangled entry points are internal +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define internal void @"?defaultMain@@YAXXZ"() [[IntAttr]] +// CHECK: ret void + +// Canonical entry points are external and shader attributed +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define void @defaultMain() [[EntryAttr]] +// CHECK: ret void + +[numthreads(1,1,1)] +[shader("compute")] +void defaultMain() { + Node haystack[MAX]; + uint needle = 0; + if (InitTree(haystack, gTree, 4)) + needle = Find(haystack, needle); +} + +// CHECK: attributes [[IntAttr]] = {{.*}} norecurse +// CHECK: attributes [[ExtAttr]] = {{.*}} norecurse +// CHECK: attributes [[EntryAttr]] = {{.*}} norecurse diff --git 
a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl index 0c4905e0f45980..f0affcb69a3fcd 100644 --- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl @@ -25,7 +25,7 @@ void main() { } // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators. -// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%Another = alloca %struct.Pair, align 4 @@ -42,7 +42,7 @@ void main() { // CHECK-NEXT:%0 = load i32, ptr %First2, align 4 // CHECK-NEXT:ret i32 %0 -// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4 diff --git a/clang/test/CodeGenHLSL/this-assignment.hlsl b/clang/test/CodeGenHLSL/this-assignment.hlsl index 6916afcde40546..5c8de0a18ef7ca 100644 --- a/clang/test/CodeGenHLSL/this-assignment.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment.hlsl @@ -24,7 +24,7 @@ void main() { } // This tests reference like implicit this in HLSL -// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // 
CHECK-NEXT:%Another = alloca %struct.Pair, align 4 @@ -34,7 +34,7 @@ void main() { // CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %Another, i32 8, i1 false) // CHECK-NEXT:%First = getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 0 -// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%ref.tmp = alloca %struct.Pair, align 4 From ecd65e64e885b0fd2786ca99ea0c42d692275d91 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Thu, 29 Aug 2024 10:02:45 -0700 Subject: [PATCH 23/72] [DXIL][test] Fix a few tests now that HLSL functions are internalized (#106437) These tests have been failing since db279c72f2fe "[HLSL] Change default linkage of HLSL functions to internal (#95331)". This presumably went unnoticed because they're not run by default since they rely on an external tool (dxil-dis). 
--- llvm/test/tools/dxil-dis/BasicIR.ll | 2 +- llvm/test/tools/dxil-dis/debug-info.ll | 2 +- llvm/test/tools/dxil-dis/opaque-gep.ll | 4 ++-- llvm/test/tools/dxil-dis/opaque-pointers.ll | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/tools/dxil-dis/BasicIR.ll b/llvm/test/tools/dxil-dis/BasicIR.ll index a79365a938023d..8add2f2bd263ba 100644 --- a/llvm/test/tools/dxil-dis/BasicIR.ll +++ b/llvm/test/tools/dxil-dis/BasicIR.ll @@ -2,7 +2,7 @@ ; RUN: llc --filetype=obj %s --stop-after=dxil-write-bitcode -o %t && llvm-bcanalyzer --dump-blockinfo %t | FileCheck %s --check-prefix=BLOCK_INFO -; CHECK: define i32 @foo(i32 %X, i32 %Y) { +; CHECK: define internal i32 @foo(i32 %X, i32 %Y) { ; CHECK: %Z = sub i32 %X, %Y ; CHECK: %Q = add i32 %Z, %Y ; CHECK: ret i32 %Q diff --git a/llvm/test/tools/dxil-dis/debug-info.ll b/llvm/test/tools/dxil-dis/debug-info.ll index 96e023338e5c26..0b40f275af3cd4 100644 --- a/llvm/test/tools/dxil-dis/debug-info.ll +++ b/llvm/test/tools/dxil-dis/debug-info.ll @@ -2,7 +2,7 @@ target triple = "dxil-unknown-shadermodel6.7-library" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -; CHECK: define float @fma(float, float, float) unnamed_addr #0 !dbg [[Fn:[!][0-9]+]] +; CHECK: define internal float @fma(float, float, float) unnamed_addr #0 !dbg [[Fn:[!][0-9]+]] ; Function Attrs: norecurse nounwind readnone willreturn define dso_local float @fma(float %0, float %1, float %2) local_unnamed_addr #0 !dbg !6 { ; CHECK-NEXT: call void @llvm.dbg.value(metadata float %0, metadata [[VarX:[!][0-9]+]], metadata [[Expr:[!][0-9]+]]), !dbg [[Line1:[!][0-9]+]] diff --git a/llvm/test/tools/dxil-dis/opaque-gep.ll b/llvm/test/tools/dxil-dis/opaque-gep.ll index abec0955f5a6d6..3da57e65b93c4e 100644 --- a/llvm/test/tools/dxil-dis/opaque-gep.ll +++ b/llvm/test/tools/dxil-dis/opaque-gep.ll @@ -7,7 +7,7 @@ define i32 @fn(ptr %0) { ret i32 %3 } -; CHECK: define i32 @fn(i32*) +; CHECK: define internal i32 @fn(i32*) 
; CHECK-NEXT: %2 = getelementptr i32, i32* %0, i32 4 ; CHECK-NEXT: %3 = load i32, i32* %2, align 4 @@ -17,6 +17,6 @@ define i32 @fn2(ptr addrspace(1) %0) { ret i32 %3 } -; CHECK: define i32 @fn2(i32 addrspace(1)*) +; CHECK: define internal i32 @fn2(i32 addrspace(1)*) ; CHECK-NEXT: %2 = getelementptr i32, i32 addrspace(1)* %0, i32 4 ; CHECK-NEXT: %3 = load i32, i32 addrspace(1)* %2, align 4 diff --git a/llvm/test/tools/dxil-dis/opaque-pointers.ll b/llvm/test/tools/dxil-dis/opaque-pointers.ll index 81f6a4ca97f610..87e23919629e21 100644 --- a/llvm/test/tools/dxil-dis/opaque-pointers.ll +++ b/llvm/test/tools/dxil-dis/opaque-pointers.ll @@ -7,7 +7,7 @@ define i64 @test(ptr %p) { ret i64 %v } -; CHECK: define i64 @test(i8* %p) { +; CHECK: define internal i64 @test(i8* %p) { ; CHECK-NEXT: %1 = bitcast i8* %p to i32* ; CHECK-NEXT: store i32 0, i32* %1, align 4 ; CHECK-NEXT: %2 = bitcast i8* %p to i64* @@ -19,7 +19,7 @@ define i64 @test2(ptr %p) { ret i64 %v } -; CHECK: define i64 @test2(i64* %p) { +; CHECK: define internal i64 @test2(i64* %p) { ; CHECK-NEXT: store i64 0, i64* %p, align 8 ; CHECK-NEXT: %v = load i64, i64* %p, align 8 @@ -29,7 +29,7 @@ define i64 @test3(ptr addrspace(1) %p) { ret i64 %v } -; CHECK: define i64 @test3(i8 addrspace(1)* %p) { +; CHECK: define internal i64 @test3(i8 addrspace(1)* %p) { ; CHECK-NEXT: %1 = bitcast i8 addrspace(1)* %p to i32 addrspace(1)* ; CHECK-NEXT: store i32 0, i32 addrspace(1)* %1, align 4 ; CHECK-NEXT: %2 = bitcast i8 addrspace(1)* %p to i64 addrspace(1)* @@ -41,7 +41,7 @@ define i64 @test4(ptr addrspace(1) %p) { ret i64 %v } -; CHECK: define i64 @test4(i64 addrspace(1)* %p) { +; CHECK: define internal i64 @test4(i64 addrspace(1)* %p) { ; CHECK-NEXT: store i64 0, i64 addrspace(1)* %p, align 8 ; CHECK-NEXT: %v = load i64, i64 addrspace(1)* %p, align 8 @@ -53,7 +53,7 @@ define i64 @test5(ptr %p) { ret i64 %v } -; CHECK: define i64 @test5(i8* %p) { +; CHECK: define internal i64 @test5(i8* %p) { ; CHECK-NEXT: %casted = 
addrspacecast i8* %p to i64 addrspace(1)* ; CHECK-NEXT: store i64 0, i64 addrspace(1)* %casted, align 8 ; CHECK-NEXT: %v = load i64, i64 addrspace(1)* %casted, align 8 From 2ad782f49ff20d199f31cabc9baa46dba6047d8b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 10:14:57 -0700 Subject: [PATCH 24/72] [VP] Kill VP_PROPERTY_(MEMOP,CASTOP) and simplify _CONSTRAINEDFP [nfc] (#105574) These lists are quite static. Heavy use of macros is undesirable, and not idiomatic in LLVM, so let's just use the naive switch cases. Note that the first two fields in the CONSTRAINEDFP property were utterly unused (aside from a C++ test). In the same vien as https://github.com/llvm/llvm-project/pull/105551. Once both changes have landed, we'll be left with _BINARYOP which needs a bit of additional untangling, and the actual opcode mappings. --- llvm/include/llvm/IR/VPIntrinsics.def | 58 +++++++-------------------- llvm/lib/IR/IntrinsicInst.cpp | 39 +++++++++--------- llvm/unittests/IR/VPIntrinsicTest.cpp | 16 -------- 3 files changed, 33 insertions(+), 80 deletions(-) diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 3fad00e2caf21f..e81752dc33a9ab 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -95,15 +95,10 @@ #define VP_PROPERTY_FUNCTIONAL_OPC(OPC) #endif -// Whether the intrinsic may have a rounding mode or exception behavior operand -// bundle. -// \p HASROUND '1' if the intrinsic can have a rounding mode operand bundle, -// '0' otherwise. -// \p HASEXCEPT '1' if the intrinsic can have an exception behavior operand -// bundle, '0' otherwise. -// \p INTRINID The constrained fp intrinsic this VP intrinsic corresponds to. +// If operation can have rounding or fp exceptions, maps to corresponding +// constrained fp intrinsic. 
#ifndef VP_PROPERTY_CONSTRAINEDFP -#define VP_PROPERTY_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) +#define VP_PROPERTY_CONSTRAINEDFP(INTRINID) #endif // The intrinsic and/or SDNode has the same function as this ISD Opcode. @@ -123,22 +118,11 @@ #define VP_PROPERTY_NO_FUNCTIONAL #endif -// This VP Intrinsic is a memory operation -// The pointer arg is at POINTERPOS and the data arg is at DATAPOS. -#ifndef VP_PROPERTY_MEMOP -#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) -#endif - // A property to infer VP binary-op SDNode opcodes automatically. #ifndef VP_PROPERTY_BINARYOP #define VP_PROPERTY_BINARYOP #endif -// A property to infer VP type casts automatically. -#ifndef VP_PROPERTY_CASTOP -#define VP_PROPERTY_CASTOP -#endif - /// } Property Macros ///// Integer Arithmetic { @@ -327,7 +311,7 @@ END_REGISTER_VP(vp_usub_sat, VP_USUBSAT) #define HELPER_REGISTER_BINARY_FP_VP(OPSUFFIX, VPSD, IROPC, SDOPC) \ BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, VPSD, -1) \ VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ - VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \ + VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_##OPSUFFIX) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ VP_PROPERTY_BINARYOP \ END_REGISTER_VP(vp_##OPSUFFIX, VPSD) @@ -369,14 +353,14 @@ END_REGISTER_VP(vp_sqrt, VP_SQRT) // llvm.vp.fma(x,y,z,mask,vlen) BEGIN_REGISTER_VP(vp_fma, 3, 4, VP_FMA, -1) -VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_fma) +VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_fma) VP_PROPERTY_FUNCTIONAL_INTRINSIC(fma) VP_PROPERTY_FUNCTIONAL_SDOPC(FMA) END_REGISTER_VP(vp_fma, VP_FMA) // llvm.vp.fmuladd(x,y,z,mask,vlen) BEGIN_REGISTER_VP(vp_fmuladd, 3, 4, VP_FMULADD, -1) -VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_fmuladd) +VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_fmuladd) VP_PROPERTY_FUNCTIONAL_INTRINSIC(fmuladd) VP_PROPERTY_FUNCTIONAL_SDOPC(FMAD) END_REGISTER_VP(vp_fmuladd, VP_FMULADD) @@ -479,31 +463,30 @@ END_REGISTER_VP(vp_llrint, VP_LLRINT) #error \ 
"The internal helper macro HELPER_REGISTER_FP_CAST_VP is already defined!" #endif -#define HELPER_REGISTER_FP_CAST_VP(OPSUFFIX, VPSD, IROPC, SDOPC, HASROUND) \ +#define HELPER_REGISTER_FP_CAST_VP(OPSUFFIX, VPSD, IROPC, SDOPC) \ BEGIN_REGISTER_VP(vp_##OPSUFFIX, 1, 2, VPSD, -1) \ VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ - VP_PROPERTY_CONSTRAINEDFP(HASROUND, 1, experimental_constrained_##OPSUFFIX) \ - VP_PROPERTY_CASTOP \ + VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_##OPSUFFIX) \ END_REGISTER_VP(vp_##OPSUFFIX, VPSD) // llvm.vp.fptoui(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fptoui, VP_FP_TO_UINT, FPToUI, FP_TO_UINT, 0) +HELPER_REGISTER_FP_CAST_VP(fptoui, VP_FP_TO_UINT, FPToUI, FP_TO_UINT) // llvm.vp.fptosi(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fptosi, VP_FP_TO_SINT, FPToSI, FP_TO_SINT, 0) +HELPER_REGISTER_FP_CAST_VP(fptosi, VP_FP_TO_SINT, FPToSI, FP_TO_SINT) // llvm.vp.uitofp(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(uitofp, VP_UINT_TO_FP, UIToFP, UINT_TO_FP, 1) +HELPER_REGISTER_FP_CAST_VP(uitofp, VP_UINT_TO_FP, UIToFP, UINT_TO_FP) // llvm.vp.sitofp(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(sitofp, VP_SINT_TO_FP, SIToFP, SINT_TO_FP, 1) +HELPER_REGISTER_FP_CAST_VP(sitofp, VP_SINT_TO_FP, SIToFP, SINT_TO_FP) // llvm.vp.fptrunc(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fptrunc, VP_FP_ROUND, FPTrunc, FP_ROUND, 1) +HELPER_REGISTER_FP_CAST_VP(fptrunc, VP_FP_ROUND, FPTrunc, FP_ROUND) // llvm.vp.fpext(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, FP_EXTEND, 0) +HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, FP_EXTEND) #undef HELPER_REGISTER_FP_CAST_VP @@ -517,7 +500,6 @@ HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, FP_EXTEND, 0) BEGIN_REGISTER_VP(vp_##OPSUFFIX, 1, 2, VPSD, -1) \ VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ - VP_PROPERTY_CASTOP \ END_REGISTER_VP(vp_##OPSUFFIX, VPSD) // llvm.vp.trunc(x,mask,vlen) @@ -532,13 +514,11 @@ HELPER_REGISTER_INT_CAST_VP(sext, 
VP_SIGN_EXTEND, SExt, SIGN_EXTEND) // llvm.vp.ptrtoint(x,mask,vlen) BEGIN_REGISTER_VP(vp_ptrtoint, 1, 2, VP_PTRTOINT, -1) VP_PROPERTY_FUNCTIONAL_OPC(PtrToInt) -VP_PROPERTY_CASTOP END_REGISTER_VP(vp_ptrtoint, VP_PTRTOINT) // llvm.vp.inttoptr(x,mask,vlen) BEGIN_REGISTER_VP(vp_inttoptr, 1, 2, VP_INTTOPTR, -1) VP_PROPERTY_FUNCTIONAL_OPC(IntToPtr) -VP_PROPERTY_CASTOP END_REGISTER_VP(vp_inttoptr, VP_INTTOPTR) #undef HELPER_REGISTER_INT_CAST_VP @@ -555,7 +535,7 @@ END_REGISTER_VP_SDNODE(VP_SETCC) BEGIN_REGISTER_VP_INTRINSIC(vp_fcmp, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_fcmp, VP_SETCC) VP_PROPERTY_FUNCTIONAL_OPC(FCmp) -VP_PROPERTY_CONSTRAINEDFP(0, 1, experimental_constrained_fcmp) +VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_fcmp) END_REGISTER_VP_INTRINSIC(vp_fcmp) // llvm.vp.icmp(x,y,cc,mask,vlen) @@ -579,7 +559,6 @@ BEGIN_REGISTER_VP_SDNODE(VP_STORE, 1, vp_store, 4, 5) HELPER_MAP_VPID_TO_VPSD(vp_store, VP_STORE) VP_PROPERTY_FUNCTIONAL_OPC(Store) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_store) -VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_store, VP_STORE) // llvm.experimental.vp.strided.store(val,ptr,stride,mask,vlen) @@ -588,7 +567,6 @@ BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_store, 3, 4) VP_PROPERTY_NO_FUNCTIONAL BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_STRIDED_STORE, 1, experimental_vp_strided_store, 5, 6) HELPER_MAP_VPID_TO_VPSD(experimental_vp_strided_store, EXPERIMENTAL_VP_STRIDED_STORE) -VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(experimental_vp_strided_store, EXPERIMENTAL_VP_STRIDED_STORE) // llvm.vp.scatter(ptr,val,mask,vlen) @@ -597,7 +575,6 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3) BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, 1, vp_scatter, 5, 6) HELPER_MAP_VPID_TO_VPSD(vp_scatter, VP_SCATTER) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_scatter) -VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_scatter, VP_SCATTER) // llvm.vp.load(ptr,mask,vlen) @@ -607,7 +584,6 @@ BEGIN_REGISTER_VP_SDNODE(VP_LOAD, -1, vp_load, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_load, VP_LOAD) 
VP_PROPERTY_FUNCTIONAL_OPC(Load) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load) -VP_PROPERTY_MEMOP(0, std::nullopt) END_REGISTER_VP(vp_load, VP_LOAD) // llvm.experimental.vp.strided.load(ptr,stride,mask,vlen) @@ -616,7 +592,6 @@ BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_load, 2, 3) VP_PROPERTY_NO_FUNCTIONAL BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_STRIDED_LOAD, -1, experimental_vp_strided_load, 4, 5) HELPER_MAP_VPID_TO_VPSD(experimental_vp_strided_load, EXPERIMENTAL_VP_STRIDED_LOAD) -VP_PROPERTY_MEMOP(0, std::nullopt) END_REGISTER_VP(experimental_vp_strided_load, EXPERIMENTAL_VP_STRIDED_LOAD) // llvm.vp.gather(ptr,mask,vlen) @@ -625,7 +600,6 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_gather, 1, 2) BEGIN_REGISTER_VP_SDNODE(VP_GATHER, -1, vp_gather, 4, 5) HELPER_MAP_VPID_TO_VPSD(vp_gather, VP_GATHER) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_gather) -VP_PROPERTY_MEMOP(0, std::nullopt) END_REGISTER_VP(vp_gather, VP_GATHER) ///// } Memory Operations @@ -778,10 +752,8 @@ END_REGISTER_VP(experimental_vp_splat, EXPERIMENTAL_VP_SPLAT) #undef END_REGISTER_VP_SDNODE #undef HELPER_MAP_VPID_TO_VPSD #undef VP_PROPERTY_BINARYOP -#undef VP_PROPERTY_CASTOP #undef VP_PROPERTY_CONSTRAINEDFP #undef VP_PROPERTY_FUNCTIONAL_INTRINSIC #undef VP_PROPERTY_FUNCTIONAL_OPC #undef VP_PROPERTY_FUNCTIONAL_SDOPC #undef VP_PROPERTY_NO_FUNCTIONAL -#undef VP_PROPERTY_MEMOP diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 966fa62abd94fe..7ed82c2ece464a 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -479,13 +479,16 @@ std::optional VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_MEMOP(POINTERPOS, ...) 
return POINTERPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return std::nullopt; + case Intrinsic::vp_store: + case Intrinsic::vp_scatter: + case Intrinsic::experimental_vp_strided_store: + return 1; + case Intrinsic::vp_load: + case Intrinsic::vp_gather: + case Intrinsic::experimental_vp_strided_load: + return 0; } - return std::nullopt; } /// \return The data (payload) operand of this store or scatter. @@ -499,13 +502,12 @@ Value *VPIntrinsic::getMemoryDataParam() const { std::optional VPIntrinsic::getMemoryDataParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) return DATAPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return std::nullopt; + case Intrinsic::vp_store: + case Intrinsic::vp_scatter: + case Intrinsic::experimental_vp_strided_store: + return 0; } - return std::nullopt; } constexpr bool isVPIntrinsic(Intrinsic::ID ID) { @@ -589,7 +591,7 @@ VPIntrinsic::getConstrainedIntrinsicIDForVP(Intrinsic::ID ID) { default: break; #define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_CONSTRAINEDFP(HASRND, HASEXCEPT, CID) return Intrinsic::CID; +#define VP_PROPERTY_CONSTRAINEDFP(CID) return Intrinsic::CID; #define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } @@ -760,14 +762,9 @@ bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { } bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { - switch (ID) { - default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) 
case Intrinsic::VPID: -#define VP_PROPERTY_CASTOP return true; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - } + // All of the vp.casts correspond to instructions + if (std::optional Opc = getFunctionalOpcodeForVP(ID)) + return Instruction::isCast(*Opc); return false; } diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index cf0a10d1f2e959..925a69bafa07ef 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -454,22 +454,6 @@ TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { } } -/// Check that the HANDLE_VP_TO_CONSTRAINEDFP maps to an existing intrinsic with -/// the right amount of constrained-fp metadata args. -TEST_F(VPIntrinsicTest, HandleToConstrainedFP) { -#define VP_PROPERTY_CONSTRAINEDFP(HASROUND, HASEXCEPT, CFPID) \ - { \ - SmallVector T; \ - Intrinsic::getIntrinsicInfoTableEntries(Intrinsic::CFPID, T); \ - unsigned NumMetadataArgs = 0; \ - for (auto TD : T) \ - NumMetadataArgs += (TD.Kind == Intrinsic::IITDescriptor::Metadata); \ - bool IsCmp = Intrinsic::CFPID == Intrinsic::experimental_constrained_fcmp; \ - ASSERT_EQ(NumMetadataArgs, (unsigned)(IsCmp + HASROUND + HASEXCEPT)); \ - } -#include "llvm/IR/VPIntrinsics.def" -} - } // end anonymous namespace /// Check various properties of VPReductionIntrinsics From 0a00d32c5f88fce89006dcde6e235bc77d7b495e Mon Sep 17 00:00:00 2001 From: Jordan R AW Date: Thu, 29 Aug 2024 10:16:17 -0700 Subject: [PATCH 25/72] [lldb] Add armv7a and armv8a ArchSpecs (#106433) armv7a and armv8a are common names for the application subarch for arm. These names in particular are used in ChromeOS, Android, and a few other known applications. In ChromeOS, we encountered a bug where armv7a arch was not recognised and segfaulted when starting an executable on an arm32 device. 
Google Issue Tracker: https://issuetracker.google.com/361414339 --- lldb/include/lldb/Utility/ArchSpec.h | 2 ++ lldb/source/Utility/ArchSpec.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 50830b889b9115..5990f984b09e2d 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -123,6 +123,7 @@ class ArchSpec { eCore_arm_armv6, eCore_arm_armv6m, eCore_arm_armv7, + eCore_arm_armv7a, eCore_arm_armv7l, eCore_arm_armv7f, eCore_arm_armv7s, @@ -145,6 +146,7 @@ class ArchSpec { eCore_thumbv7em, eCore_arm_arm64, eCore_arm_armv8, + eCore_arm_armv8a, eCore_arm_armv8l, eCore_arm_arm64e, eCore_arm_arm64_32, diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 4fd1a800023ce3..85bb85044ec156 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -60,6 +60,8 @@ static const CoreDefinition g_core_definitions[] = { "armv6m"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7, "armv7"}, + {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7a, + "armv7a"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7l, "armv7l"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7f, @@ -102,6 +104,8 @@ static const CoreDefinition g_core_definitions[] = { ArchSpec::eCore_arm_arm64, "arm64"}, {eByteOrderLittle, 8, 4, 4, llvm::Triple::aarch64, ArchSpec::eCore_arm_armv8, "armv8"}, + {eByteOrderLittle, 8, 4, 4, llvm::Triple::aarch64, + ArchSpec::eCore_arm_armv8a, "armv8a"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv8l, "armv8l"}, {eByteOrderLittle, 8, 4, 4, llvm::Triple::aarch64, From ed37b5f6c341a2c72d1f5f0c016f0f8a41a9bf83 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 29 Aug 2024 10:30:11 -0700 Subject: [PATCH 26/72] Revert "[Support] Validate number of arguments passed to formatv()" 
(#106589) Reverts llvm/llvm-project#105745 Some bots are broken apparently. --- .../Checkers/StdLibraryFunctionsChecker.cpp | 5 +- llvm/benchmarks/CMakeLists.txt | 1 - llvm/benchmarks/FormatVariadicBM.cpp | 63 -------------- llvm/include/llvm/Support/FormatVariadic.h | 39 ++++----- llvm/lib/Support/FormatVariadic.cpp | 85 +++---------------- llvm/unittests/Support/FormatVariadicTest.cpp | 56 ++++++------ mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 +- 7 files changed, 58 insertions(+), 195 deletions(-) delete mode 100644 llvm/benchmarks/FormatVariadicBM.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 4f30b2a0e7e7da..8f4bd17afc8581 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1401,10 +1401,7 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, ErrnoNote = llvm::formatv("After calling '{0}' {1}", FunctionName, ErrnoNote); } else { - // Disable formatv() validation as the case note may not always have the - // {0} placeholder for function name. 
- CaseNote = - llvm::formatv(false, Case.getNote().str().c_str(), FunctionName); + CaseNote = llvm::formatv(Case.getNote().str().c_str(), FunctionName); } const SVal RV = Call.getReturnValue(); diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index e3366e6f3ffe19..713d4ccd3c5975 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -5,4 +5,3 @@ set(LLVM_LINK_COMPONENTS add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) -add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp deleted file mode 100644 index c03ead400d0d5c..00000000000000 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "llvm/Support/FormatVariadic.h" -#include -#include -#include - -using namespace llvm; -using namespace std; - -// Generate a list of format strings that have `NumReplacements` replacements -// by permuting the replacements and some literal text. -static vector getFormatStrings(int NumReplacements) { - vector Components; - for (int I = 0; I < NumReplacements; I++) - Components.push_back("{" + to_string(I) + "}"); - // Intersperse these with some other literal text (_). 
- const string_view Literal = "____"; - for (char C : Literal) - Components.push_back(string(1, C)); - - vector Formats; - do { - string Concat; - for (const string &C : Components) - Concat += C; - Formats.emplace_back(Concat); - } while (next_permutation(Components.begin(), Components.end())); - return Formats; -} - -// Generate the set of formats to exercise outside the benchmark code. -static const vector> Formats = { - getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), - getFormatStrings(4), getFormatStrings(5), -}; - -// Benchmark formatv() for a variety of format strings and 1-5 replacements. -static void BM_FormatVariadic(benchmark::State &state) { - for (auto _ : state) { - for (const string &Fmt : Formats[0]) - formatv(Fmt.c_str(), 1).str(); - for (const string &Fmt : Formats[1]) - formatv(Fmt.c_str(), 1, 2).str(); - for (const string &Fmt : Formats[2]) - formatv(Fmt.c_str(), 1, 2, 3).str(); - for (const string &Fmt : Formats[3]) - formatv(Fmt.c_str(), 1, 2, 3, 4).str(); - for (const string &Fmt : Formats[4]) - formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); - } -} - -BENCHMARK(BM_FormatVariadic); - -BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index f31ad70021579e..595f2cf559a428 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -67,20 +67,23 @@ class formatv_object_base { protected: StringRef Fmt; ArrayRef Adapters; - bool Validate; + + static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad); + + static std::pair + splitLiteralAndReplacement(StringRef Fmt); formatv_object_base(StringRef Fmt, - ArrayRef Adapters, - bool Validate) - : Fmt(Fmt), Adapters(Adapters), Validate(Validate) {} + ArrayRef Adapters) + : Fmt(Fmt), Adapters(Adapters) {} formatv_object_base(formatv_object_base const &rhs) = delete; formatv_object_base(formatv_object_base &&rhs) = default; public: void 
format(raw_ostream &S) const { - const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); - for (const auto &R : Replacements) { + for (auto &R : parseFormatString(Fmt)) { if (R.Type == ReplacementType::Empty) continue; if (R.Type == ReplacementType::Literal) { @@ -98,10 +101,9 @@ class formatv_object_base { Align.format(S, R.Options); } } + static SmallVector parseFormatString(StringRef Fmt); - // Parse and optionally validate format string (in debug builds). - static SmallVector - parseFormatString(StringRef Fmt, size_t NumArgs, bool Validate); + static std::optional parseReplacementItem(StringRef Spec); std::string str() const { std::string Result; @@ -147,8 +149,8 @@ template class formatv_object : public formatv_object_base { }; public: - formatv_object(StringRef Fmt, Tuple &&Params, bool Validate) - : formatv_object_base(Fmt, ParameterPointers, Validate), + formatv_object(StringRef Fmt, Tuple &&Params) + : formatv_object_base(Fmt, ParameterPointers), Parameters(std::move(Params)) { ParameterPointers = std::apply(create_adapters(), Parameters); } @@ -245,22 +247,15 @@ template class formatv_object : public formatv_object_base { // assertion. Otherwise, it will try to do something reasonable, but in general // the details of what that is are undefined. // - -// formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) +inline auto formatv(const char *Fmt, Ts &&...Vals) -> formatv_object(Vals))...))> { using ParamTuple = decltype(std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...)); - auto Params = std::make_tuple( - support::detail::build_format_adapter(std::forward(Vals))...); - return formatv_object(Fmt, std::move(Params), Validate); -} - -// formatv() with validation enabled. 
-template inline auto formatv(const char *Fmt, Ts &&...Vals) { - return formatv(true, Fmt, std::forward(Vals)...); + return formatv_object( + Fmt, std::make_tuple(support::detail::build_format_adapter( + std::forward(Vals))...)); } } // end namespace llvm diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index 26d2b549136e43..e25d036cdf1e8c 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -25,8 +25,8 @@ static std::optional translateLocChar(char C) { LLVM_BUILTIN_UNREACHABLE; } -static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { +bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -35,7 +35,8 @@ static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, if (Spec.size() > 1) { // A maximum of 2 characters at the beginning can be used for something - // other than the width. + // other + // than the width. // If Spec[1] is a loc char, then Spec[0] is a pad char and Spec[2:...] // contains the width. // Otherwise, if Spec[0] is a loc char, then Spec[1:...] contains the width. 
@@ -54,7 +55,8 @@ static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, return !Failed; } -static std::optional parseReplacementItem(StringRef Spec) { +std::optional +formatv_object_base::parseReplacementItem(StringRef Spec) { StringRef RepString = Spec.trim("{}"); // If the replacement sequence does not start with a non-negative integer, @@ -80,14 +82,15 @@ static std::optional parseReplacementItem(StringRef Spec) { RepString = StringRef(); } RepString = RepString.trim(); - assert(RepString.empty() && - "Unexpected characters found in replacement string!"); + if (!RepString.empty()) { + assert(false && "Unexpected characters found in replacement string!"); + } return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; } -static std::pair -splitLiteralAndReplacement(StringRef Fmt) { +std::pair +formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { while (!Fmt.empty()) { // Everything up until the first brace is a literal. if (Fmt.front() != '{') { @@ -140,77 +143,15 @@ splitLiteralAndReplacement(StringRef Fmt) { return std::make_pair(ReplacementItem{Fmt}, StringRef()); } -#ifndef NDEBUG -#define ENABLE_VALIDATION 1 -#else -#define ENABLE_VALIDATION 0 // Conveniently enable validation in release mode. -#endif - SmallVector -formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, - bool Validate) { +formatv_object_base::parseFormatString(StringRef Fmt) { SmallVector Replacements; - -#if ENABLE_VALIDATION - const StringRef SavedFmtStr = Fmt; - size_t NumExpectedArgs = 0; -#endif - + ReplacementItem I; while (!Fmt.empty()) { - ReplacementItem I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); if (I.Type != ReplacementType::Empty) Replacements.push_back(I); -#if ENABLE_VALIDATION - if (I.Type == ReplacementType::Format) - NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); -#endif - } - -#if ENABLE_VALIDATION - if (!Validate) - return Replacements; - - // Perform additional validation. 
Verify that the number of arguments matches - // the number of replacement indices and that there are no holes in the - // replacement indices. - - // When validation fails, return an array of replacement items that - // will print an error message as the outout of this formatv() (used when - // validation is enabled in release mode). - auto getErrorReplacements = [SavedFmtStr](StringLiteral ErrorMsg) { - return SmallVector{ - ReplacementItem("Invalid formatv() call: "), ReplacementItem(ErrorMsg), - ReplacementItem(" for format string: "), ReplacementItem(SavedFmtStr)}; - }; - - if (NumExpectedArgs != NumArgs) { - errs() << formatv( - "Expected {0} Args, but got {1} for format string '{2}'\n", - NumExpectedArgs, NumArgs, SavedFmtStr); - assert(0 && "Invalid formatv() call"); - return getErrorReplacements("Unexpected number of arguments"); - } - - // Find the number of unique indices seen. All replacement indices - // are < NumExpectedArgs. - SmallVector Indices(NumExpectedArgs); - size_t Count = 0; - for (const ReplacementItem &I : Replacements) { - if (I.Type != ReplacementType::Format || Indices[I.Index]) - continue; - Indices[I.Index] = true; - ++Count; - } - - if (Count != NumExpectedArgs) { - errs() << formatv( - "Replacement field indices cannot have holes for format string '{0}'\n", - SavedFmtStr); - assert(0 && "Invalid format string"); - return getErrorReplacements("Replacement indices have holes"); } -#endif // ENABLE_VALIDATION return Replacements; } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index 6ee0d924867419..a78b25c53d7e43 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -9,11 +9,9 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatAdapters.h" -#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; -using ::testing::HasSubstr; // Compile-time tests 
templates in the detail namespace. namespace { @@ -37,19 +35,14 @@ struct NoFormat {}; static_assert(uses_missing_provider::value, ""); } -// Helper to parse format string with no validation. -static SmallVector parseFormatString(StringRef Fmt) { - return formatv_object_base::parseFormatString(Fmt, 0, false); -} - TEST(FormatVariadicTest, EmptyFormatString) { - auto Replacements = parseFormatString(""); + auto Replacements = formatv_object_base::parseFormatString(""); EXPECT_EQ(0U, Replacements.size()); } TEST(FormatVariadicTest, NoReplacements) { const StringRef kFormatString = "This is a test"; - auto Replacements = parseFormatString(kFormatString); + auto Replacements = formatv_object_base::parseFormatString(kFormatString); ASSERT_EQ(1U, Replacements.size()); EXPECT_EQ(kFormatString, Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -57,25 +50,25 @@ TEST(FormatVariadicTest, NoReplacements) { TEST(FormatVariadicTest, EscapedBrace) { // {{ should be replaced with { - auto Replacements = parseFormatString("{{"); + auto Replacements = formatv_object_base::parseFormatString("{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // An even number N of braces should be replaced with N/2 braces. - Replacements = parseFormatString("{{{{{{"); + Replacements = formatv_object_base::parseFormatString("{{{{{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{{{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. - Replacements = parseFormatString("}"); + Replacements = formatv_object_base::parseFormatString("}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. 
- Replacements = parseFormatString("}}}"); + Replacements = formatv_object_base::parseFormatString("}}}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}}}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -83,14 +76,14 @@ TEST(FormatVariadicTest, EscapedBrace) { TEST(FormatVariadicTest, ValidReplacementSequence) { // 1. Simple replacement - parameter index only - auto Replacements = parseFormatString("{0}"); + auto Replacements = formatv_object_base::parseFormatString("{0}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(0u, Replacements[0].Align); EXPECT_EQ("", Replacements[0].Options); - Replacements = parseFormatString("{1}"); + Replacements = formatv_object_base::parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); @@ -99,7 +92,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 2. Parameter index with right alignment - Replacements = parseFormatString("{0,3}"); + Replacements = formatv_object_base::parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -108,7 +101,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 3. And left alignment - Replacements = parseFormatString("{0,-3}"); + Replacements = formatv_object_base::parseFormatString("{0,-3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -117,7 +110,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. 
And center alignment - Replacements = parseFormatString("{0,=3}"); + Replacements = formatv_object_base::parseFormatString("{0,=3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -126,7 +119,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. Parameter index with option string - Replacements = parseFormatString("{0:foo}"); + Replacements = formatv_object_base::parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -135,7 +128,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 5. Parameter index with alignment before option string - Replacements = parseFormatString("{0,-3:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,-3:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -144,7 +137,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 7. Parameter indices, options, and alignment can all have whitespace. - Replacements = parseFormatString("{ 0, -3 : foo }"); + Replacements = formatv_object_base::parseFormatString("{ 0, -3 : foo }"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -154,7 +147,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. 
- Replacements = parseFormatString("{0:0:1}"); + Replacements = formatv_object_base::parseFormatString("{0:0:1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -164,7 +157,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:1", Replacements[0].Options); // 9. Custom padding character - Replacements = parseFormatString("{0,p+4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,p+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -175,7 +168,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // Format string special characters are allowed as padding character - Replacements = parseFormatString("{0,-+4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,-+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -185,7 +178,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = parseFormatString("{0,+-4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,+-4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -195,7 +188,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = parseFormatString("{0,==4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,==4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -205,7 +198,7 @@ TEST(FormatVariadicTest, 
ValidReplacementSequence) { EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = parseFormatString("{0,:=4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,:=4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -218,7 +211,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { TEST(FormatVariadicTest, DefaultReplacementValues) { // 2. If options string is missing, it defaults to empty. - auto Replacements = parseFormatString("{0,3}"); + auto Replacements = formatv_object_base::parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -226,7 +219,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. - Replacements = parseFormatString("{0,3:}"); + Replacements = formatv_object_base::parseFormatString("{0,3:}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -234,7 +227,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space - Replacements = parseFormatString("{0:foo}"); + Replacements = formatv_object_base::parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); @@ -245,7 +238,8 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { } TEST(FormatVariadicTest, MultipleReplacements) { - auto Replacements = parseFormatString("{0} {1:foo}-{2,-3:bar}"); + auto Replacements = + formatv_object_base::parseFormatString("{0} {1:foo}-{2,-3:bar}"); ASSERT_EQ(5u, Replacements.size()); // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 7016fe41ca75d0..82f8718fc556ad 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1654,12 +1654,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, dir->shouldBeQualified() ? qualifiedTypeParserCode : typeParserCode; TypeSwitch(dir->getArg()) .Case([&](auto operand) { - body << formatv(false, parserCode, + body << formatv(parserCode, operand->getVar()->constraint.getCppType(), listName); }) .Default([&](auto operand) { - body << formatv(false, parserCode, "::mlir::Type", listName); + body << formatv(parserCode, "::mlir::Type", listName); }); } } else if (auto *dir = dyn_cast(element)) { From 67ffd1438379ee43f678f3e7752f4ec5f777cee4 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 29 Aug 2024 14:36:39 -0300 Subject: [PATCH 27/72] libcxx: [NFC] relax error expectation for clang diagnostics (#106591) This is a split-off from #96023, where this change has already been reviewed by libcxx maintainers. This will prevent that PR from triggering libcxx-ci from now on. 
--- libcxx/test/libcxx/type_traits/is_specialization.verify.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp index a798647d56ee1c..3593c2e095db91 100644 --- a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp +++ b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp @@ -17,5 +17,5 @@ #include #include -// expected-error@+1 {{template template argument has different template parameters than its corresponding template template parameter}} +// expected-error-re@*:* {{{{could not match _Size against 'type-parameter-0-0'|different template parameters}}}} static_assert(!std::__is_specialization_v, std::array>); From 9ce4af5cadc24060f3c3674e01902d374afea983 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 10:39:40 -0700 Subject: [PATCH 28/72] Revert "Revert "[Support] Validate number of arguments passed to formatv()"" (#106592) Reverts llvm/llvm-project#106589 The fix for bot failures caused by the reverted commit was committed already, so this revert is not needed. 
--- .../Checkers/StdLibraryFunctionsChecker.cpp | 5 +- llvm/benchmarks/CMakeLists.txt | 1 + llvm/benchmarks/FormatVariadicBM.cpp | 63 ++++++++++++++ llvm/include/llvm/Support/FormatVariadic.h | 39 +++++---- llvm/lib/Support/FormatVariadic.cpp | 85 ++++++++++++++++--- llvm/unittests/Support/FormatVariadicTest.cpp | 56 ++++++------ mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 +- 7 files changed, 195 insertions(+), 58 deletions(-) create mode 100644 llvm/benchmarks/FormatVariadicBM.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 8f4bd17afc8581..4f30b2a0e7e7da 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1401,7 +1401,10 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, ErrnoNote = llvm::formatv("After calling '{0}' {1}", FunctionName, ErrnoNote); } else { - CaseNote = llvm::formatv(Case.getNote().str().c_str(), FunctionName); + // Disable formatv() validation as the case note may not always have the + // {0} placeholder for function name. 
+ CaseNote = + llvm::formatv(false, Case.getNote().str().c_str(), FunctionName); } const SVal RV = Call.getReturnValue(); diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index 713d4ccd3c5975..e3366e6f3ffe19 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -5,3 +5,4 @@ set(LLVM_LINK_COMPONENTS add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp new file mode 100644 index 00000000000000..c03ead400d0d5c --- /dev/null +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -0,0 +1,63 @@ +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +using namespace llvm; +using namespace std; + +// Generate a list of format strings that have `NumReplacements` replacements +// by permuting the replacements and some literal text. +static vector getFormatStrings(int NumReplacements) { + vector Components; + for (int I = 0; I < NumReplacements; I++) + Components.push_back("{" + to_string(I) + "}"); + // Intersperse these with some other literal text (_). 
+ const string_view Literal = "____"; + for (char C : Literal) + Components.push_back(string(1, C)); + + vector Formats; + do { + string Concat; + for (const string &C : Components) + Concat += C; + Formats.emplace_back(Concat); + } while (next_permutation(Components.begin(), Components.end())); + return Formats; +} + +// Generate the set of formats to exercise outside the benchmark code. +static const vector> Formats = { + getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), + getFormatStrings(4), getFormatStrings(5), +}; + +// Benchmark formatv() for a variety of format strings and 1-5 replacements. +static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 595f2cf559a428..f31ad70021579e 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -67,23 +67,20 @@ class formatv_object_base { protected: StringRef Fmt; ArrayRef Adapters; - - static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad); - - static std::pair - splitLiteralAndReplacement(StringRef Fmt); + bool Validate; formatv_object_base(StringRef Fmt, - ArrayRef Adapters) - : Fmt(Fmt), Adapters(Adapters) {} + ArrayRef Adapters, + bool Validate) + : Fmt(Fmt), Adapters(Adapters), Validate(Validate) {} formatv_object_base(formatv_object_base const &rhs) = delete; formatv_object_base(formatv_object_base &&rhs) = default; public: void 
format(raw_ostream &S) const { - for (auto &R : parseFormatString(Fmt)) { + const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); + for (const auto &R : Replacements) { if (R.Type == ReplacementType::Empty) continue; if (R.Type == ReplacementType::Literal) { @@ -101,9 +98,10 @@ class formatv_object_base { Align.format(S, R.Options); } } - static SmallVector parseFormatString(StringRef Fmt); - static std::optional parseReplacementItem(StringRef Spec); + // Parse and optionally validate format string (in debug builds). + static SmallVector + parseFormatString(StringRef Fmt, size_t NumArgs, bool Validate); std::string str() const { std::string Result; @@ -149,8 +147,8 @@ template class formatv_object : public formatv_object_base { }; public: - formatv_object(StringRef Fmt, Tuple &&Params) - : formatv_object_base(Fmt, ParameterPointers), + formatv_object(StringRef Fmt, Tuple &&Params, bool Validate) + : formatv_object_base(Fmt, ParameterPointers, Validate), Parameters(std::move(Params)) { ParameterPointers = std::apply(create_adapters(), Parameters); } @@ -247,15 +245,22 @@ template class formatv_object : public formatv_object_base { // assertion. Otherwise, it will try to do something reasonable, but in general // the details of what that is are undefined. // + +// formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(const char *Fmt, Ts &&...Vals) +inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) -> formatv_object(Vals))...))> { using ParamTuple = decltype(std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...)); - return formatv_object( - Fmt, std::make_tuple(support::detail::build_format_adapter( - std::forward(Vals))...)); + auto Params = std::make_tuple( + support::detail::build_format_adapter(std::forward(Vals))...); + return formatv_object(Fmt, std::move(Params), Validate); +} + +// formatv() with validation enabled. 
+template inline auto formatv(const char *Fmt, Ts &&...Vals) { + return formatv(true, Fmt, std::forward(Vals)...); } } // end namespace llvm diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index e25d036cdf1e8c..26d2b549136e43 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -25,8 +25,8 @@ static std::optional translateLocChar(char C) { LLVM_BUILTIN_UNREACHABLE; } -bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { +static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -35,8 +35,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, if (Spec.size() > 1) { // A maximum of 2 characters at the beginning can be used for something - // other - // than the width. + // other than the width. // If Spec[1] is a loc char, then Spec[0] is a pad char and Spec[2:...] // contains the width. // Otherwise, if Spec[0] is a loc char, then Spec[1:...] contains the width. 
@@ -55,8 +54,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, return !Failed; } -std::optional -formatv_object_base::parseReplacementItem(StringRef Spec) { +static std::optional parseReplacementItem(StringRef Spec) { StringRef RepString = Spec.trim("{}"); // If the replacement sequence does not start with a non-negative integer, @@ -82,15 +80,14 @@ formatv_object_base::parseReplacementItem(StringRef Spec) { RepString = StringRef(); } RepString = RepString.trim(); - if (!RepString.empty()) { - assert(false && "Unexpected characters found in replacement string!"); - } + assert(RepString.empty() && + "Unexpected characters found in replacement string!"); return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; } -std::pair -formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { +static std::pair +splitLiteralAndReplacement(StringRef Fmt) { while (!Fmt.empty()) { // Everything up until the first brace is a literal. if (Fmt.front() != '{') { @@ -143,15 +140,77 @@ formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { return std::make_pair(ReplacementItem{Fmt}, StringRef()); } +#ifndef NDEBUG +#define ENABLE_VALIDATION 1 +#else +#define ENABLE_VALIDATION 0 // Conveniently enable validation in release mode. 
+#endif + SmallVector -formatv_object_base::parseFormatString(StringRef Fmt) { +formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, + bool Validate) { SmallVector Replacements; - ReplacementItem I; + +#if ENABLE_VALIDATION + const StringRef SavedFmtStr = Fmt; + size_t NumExpectedArgs = 0; +#endif + while (!Fmt.empty()) { + ReplacementItem I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); if (I.Type != ReplacementType::Empty) Replacements.push_back(I); +#if ENABLE_VALIDATION + if (I.Type == ReplacementType::Format) + NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); +#endif + } + +#if ENABLE_VALIDATION + if (!Validate) + return Replacements; + + // Perform additional validation. Verify that the number of arguments matches + // the number of replacement indices and that there are no holes in the + // replacement indices. + + // When validation fails, return an array of replacement items that + // will print an error message as the outout of this formatv() (used when + // validation is enabled in release mode). + auto getErrorReplacements = [SavedFmtStr](StringLiteral ErrorMsg) { + return SmallVector{ + ReplacementItem("Invalid formatv() call: "), ReplacementItem(ErrorMsg), + ReplacementItem(" for format string: "), ReplacementItem(SavedFmtStr)}; + }; + + if (NumExpectedArgs != NumArgs) { + errs() << formatv( + "Expected {0} Args, but got {1} for format string '{2}'\n", + NumExpectedArgs, NumArgs, SavedFmtStr); + assert(0 && "Invalid formatv() call"); + return getErrorReplacements("Unexpected number of arguments"); + } + + // Find the number of unique indices seen. All replacement indices + // are < NumExpectedArgs. 
+ SmallVector Indices(NumExpectedArgs); + size_t Count = 0; + for (const ReplacementItem &I : Replacements) { + if (I.Type != ReplacementType::Format || Indices[I.Index]) + continue; + Indices[I.Index] = true; + ++Count; + } + + if (Count != NumExpectedArgs) { + errs() << formatv( + "Replacement field indices cannot have holes for format string '{0}'\n", + SavedFmtStr); + assert(0 && "Invalid format string"); + return getErrorReplacements("Replacement indices have holes"); } +#endif // ENABLE_VALIDATION return Replacements; } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index a78b25c53d7e43..6ee0d924867419 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -9,9 +9,11 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatAdapters.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; +using ::testing::HasSubstr; // Compile-time tests templates in the detail namespace. namespace { @@ -35,14 +37,19 @@ struct NoFormat {}; static_assert(uses_missing_provider::value, ""); } +// Helper to parse format string with no validation. 
+static SmallVector parseFormatString(StringRef Fmt) { + return formatv_object_base::parseFormatString(Fmt, 0, false); +} + TEST(FormatVariadicTest, EmptyFormatString) { - auto Replacements = formatv_object_base::parseFormatString(""); + auto Replacements = parseFormatString(""); EXPECT_EQ(0U, Replacements.size()); } TEST(FormatVariadicTest, NoReplacements) { const StringRef kFormatString = "This is a test"; - auto Replacements = formatv_object_base::parseFormatString(kFormatString); + auto Replacements = parseFormatString(kFormatString); ASSERT_EQ(1U, Replacements.size()); EXPECT_EQ(kFormatString, Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -50,25 +57,25 @@ TEST(FormatVariadicTest, NoReplacements) { TEST(FormatVariadicTest, EscapedBrace) { // {{ should be replaced with { - auto Replacements = formatv_object_base::parseFormatString("{{"); + auto Replacements = parseFormatString("{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // An even number N of braces should be replaced with N/2 braces. - Replacements = formatv_object_base::parseFormatString("{{{{{{"); + Replacements = parseFormatString("{{{{{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{{{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. - Replacements = formatv_object_base::parseFormatString("}"); + Replacements = parseFormatString("}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. 
- Replacements = formatv_object_base::parseFormatString("}}}"); + Replacements = parseFormatString("}}}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}}}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -76,14 +83,14 @@ TEST(FormatVariadicTest, EscapedBrace) { TEST(FormatVariadicTest, ValidReplacementSequence) { // 1. Simple replacement - parameter index only - auto Replacements = formatv_object_base::parseFormatString("{0}"); + auto Replacements = parseFormatString("{0}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(0u, Replacements[0].Align); EXPECT_EQ("", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{1}"); + Replacements = parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); @@ -92,7 +99,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 2. Parameter index with right alignment - Replacements = formatv_object_base::parseFormatString("{0,3}"); + Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -101,7 +108,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 3. And left alignment - Replacements = formatv_object_base::parseFormatString("{0,-3}"); + Replacements = parseFormatString("{0,-3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -110,7 +117,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. 
And center alignment - Replacements = formatv_object_base::parseFormatString("{0,=3}"); + Replacements = parseFormatString("{0,=3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -119,7 +126,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. Parameter index with option string - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -128,7 +135,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 5. Parameter index with alignment before option string - Replacements = formatv_object_base::parseFormatString("{0,-3:foo}"); + Replacements = parseFormatString("{0,-3:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -137,7 +144,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 7. Parameter indices, options, and alignment can all have whitespace. - Replacements = formatv_object_base::parseFormatString("{ 0, -3 : foo }"); + Replacements = parseFormatString("{ 0, -3 : foo }"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -147,7 +154,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. 
- Replacements = formatv_object_base::parseFormatString("{0:0:1}"); + Replacements = parseFormatString("{0:0:1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -157,7 +164,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:1", Replacements[0].Options); // 9. Custom padding character - Replacements = formatv_object_base::parseFormatString("{0,p+4:foo}"); + Replacements = parseFormatString("{0,p+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -168,7 +175,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // Format string special characters are allowed as padding character - Replacements = formatv_object_base::parseFormatString("{0,-+4:foo}"); + Replacements = parseFormatString("{0,-+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -178,7 +185,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,+-4:foo}"); + Replacements = parseFormatString("{0,+-4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -188,7 +195,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,==4:foo}"); + Replacements = parseFormatString("{0,==4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -198,7 +205,7 @@ TEST(FormatVariadicTest, 
ValidReplacementSequence) { EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,:=4:foo}"); + Replacements = parseFormatString("{0,:=4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -211,7 +218,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { TEST(FormatVariadicTest, DefaultReplacementValues) { // 2. If options string is missing, it defaults to empty. - auto Replacements = formatv_object_base::parseFormatString("{0,3}"); + auto Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -219,7 +226,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. - Replacements = formatv_object_base::parseFormatString("{0,3:}"); + Replacements = parseFormatString("{0,3:}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -227,7 +234,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); @@ -238,8 +245,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { } TEST(FormatVariadicTest, MultipleReplacements) { - auto Replacements = - formatv_object_base::parseFormatString("{0} {1:foo}-{2,-3:bar}"); + auto Replacements = parseFormatString("{0} {1:foo}-{2,-3:bar}"); ASSERT_EQ(5u, Replacements.size()); // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 82f8718fc556ad..7016fe41ca75d0 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1654,12 +1654,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, dir->shouldBeQualified() ? qualifiedTypeParserCode : typeParserCode; TypeSwitch(dir->getArg()) .Case([&](auto operand) { - body << formatv(parserCode, + body << formatv(false, parserCode, operand->getVar()->constraint.getCppType(), listName); }) .Default([&](auto operand) { - body << formatv(parserCode, "::mlir::Type", listName); + body << formatv(false, parserCode, "::mlir::Type", listName); }); } } else if (auto *dir = dyn_cast(element)) { From 9a58b12fe7bf54c9433ec89bae2a2d6cfe489e75 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 29 Aug 2024 18:41:15 +0100 Subject: [PATCH 29/72] [ExtendLifetimes][NFC] Add explicit triple to new fake-use tests Several tests for the new fake use intrinsic are failing on NVPTX buildbots due to relying on behaviour for their expected triple; this commit adds that triple to each of them to prevent failures. Fixes commit 3d08ade (#86149). 
Example buildbot failures: https://lab.llvm.org/buildbot/#/builders/160/builds/4175 https://lab.llvm.org/buildbot/#/builders/180/builds/4173 --- llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir | 2 +- llvm/test/CodeGen/X86/fake-use-scheduler.mir | 2 +- llvm/test/CodeGen/X86/fake-use-tailcall.ll | 2 +- llvm/test/CodeGen/X86/fake-use-vector2.ll | 2 +- llvm/test/CodeGen/X86/fake-use-zero-length.ll | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir index 7eb8915f26a80f..6c2cb0e55222b2 100644 --- a/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir +++ b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir @@ -22,7 +22,7 @@ # return temp; # } # -# RUN: llc -run-pass=codegenprepare -o - %s | FileCheck %s +# RUN: llc -run-pass=codegenprepare -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s # # CHECK: define{{.*}}foo # CHECK: if.then: diff --git a/llvm/test/CodeGen/X86/fake-use-scheduler.mir b/llvm/test/CodeGen/X86/fake-use-scheduler.mir index 7e55f1d79aa7b6..8b82c4ed2485dc 100644 --- a/llvm/test/CodeGen/X86/fake-use-scheduler.mir +++ b/llvm/test/CodeGen/X86/fake-use-scheduler.mir @@ -1,5 +1,5 @@ # Prevent the machine scheduler from moving instructions past FAKE_USE. 
-# RUN: llc -run-pass machine-scheduler -debug-only=machine-scheduler 2>&1 -o - %s | FileCheck %s +# RUN: llc -run-pass machine-scheduler -mtriple=x86_64-unknown-linux -debug-only=machine-scheduler 2>&1 -o - %s | FileCheck %s # REQUIRES: asserts # # We make sure that, beginning with the first FAKE_USE instruction, diff --git a/llvm/test/CodeGen/X86/fake-use-tailcall.ll b/llvm/test/CodeGen/X86/fake-use-tailcall.ll index 10bb22e1b564ab..67c28dcf1301c4 100644 --- a/llvm/test/CodeGen/X86/fake-use-tailcall.ll +++ b/llvm/test/CodeGen/X86/fake-use-tailcall.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -stop-after=finalize-isel - | FileCheck %s --implicit-check-not FAKE_USE +; RUN: llc < %s -stop-after=finalize-isel -mtriple=x86_64-unknown-linux - | FileCheck %s --implicit-check-not FAKE_USE ; Fake uses following tail calls should be pulled in front ; of the TCRETURN instruction. Fake uses using something defined by ; the tail call or after it should be suppressed. diff --git a/llvm/test/CodeGen/X86/fake-use-vector2.ll b/llvm/test/CodeGen/X86/fake-use-vector2.ll index 6f2d3a5566dc67..190197615775a9 100644 --- a/llvm/test/CodeGen/X86/fake-use-vector2.ll +++ b/llvm/test/CodeGen/X86/fake-use-vector2.ll @@ -1,4 +1,4 @@ -; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -filetype=asm -o - %s | FileCheck %s ; ; Make sure we can split vectors that are used as operands of FAKE_USE. 
diff --git a/llvm/test/CodeGen/X86/fake-use-zero-length.ll b/llvm/test/CodeGen/X86/fake-use-zero-length.ll index e8c6791b8edff2..e3bdd2659dd913 100644 --- a/llvm/test/CodeGen/X86/fake-use-zero-length.ll +++ b/llvm/test/CodeGen/X86/fake-use-zero-length.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -stop-after=finalize-isel | FileCheck %s --implicit-check-not=FAKE_USE +; RUN: llc < %s -stop-after=finalize-isel -mtriple=x86_64-unknown-linux | FileCheck %s --implicit-check-not=FAKE_USE ; ; Make sure SelectionDAG does not crash handling fake uses of zero-length arrays ; and structs. Check also that they are not propagated. From 4bc7c74240b6f13bf421c1fef0155370b23d9fc8 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 11:00:31 -0700 Subject: [PATCH 30/72] [SLP] Extract isIdentityOrder to common routine [probably NFC] (#106582) This isn't quite just code motion as the four different versions we had of this routine differed in whether they ignored the "size" marker used to represent undef. I doubt this matters in practice, but it is a functional change. --------- Co-authored-by: Alexey Bataev --- .../Transforms/Vectorize/SLPVectorizer.cpp | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 81811e0a4d9295..e77db3cbd81fe5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1364,6 +1364,18 @@ class BoUpSLP { /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); + /// Does this non-empty order represent an identity order? Identity + /// should be represented as an empty order, so this is used to + /// decide if we can canonicalize a computed order. Undef elements + /// (represented as size) are ignored. 
+ bool isIdentityOrder(ArrayRef Order) const { + assert(!Order.empty() && "expected non-empty order"); + const unsigned Sz = Order.size(); + return all_of(enumerate(Order), [&](const auto &P) { + return P.value() == P.index() || P.value() == Sz; + }); + } + /// Checks if the specified gather tree entry \p TE can be represented as a /// shuffled vector entry + (possibly) permutation with other gathers. It /// implements the checks only for possibly ordered scalars (Loads, @@ -5256,12 +5268,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } return I1 < I2; }; - auto IsIdentityOrder = [](const OrdersType &Order) { - for (unsigned Idx : seq(0, Order.size())) - if (Idx != Order[Idx]) - return false; - return true; - }; DenseMap PhiToId; SmallVector Phis(TE.Scalars.size()); std::iota(Phis.begin(), Phis.end(), 0); @@ -5271,7 +5277,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { stable_sort(Phis, PHICompare); for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) ResOrder[Id] = PhiToId[Phis[Id]]; - if (IsIdentityOrder(ResOrder)) + if (isIdentityOrder(ResOrder)) return std::nullopt; // No need to reorder. return std::move(ResOrder); } @@ -5565,19 +5571,12 @@ void BoUpSLP::reorderTopToBottom() { } if (OrdersUses.empty()) continue; - auto IsIdentityOrder = [](ArrayRef Order) { - const unsigned Sz = Order.size(); - for (unsigned Idx : seq(0, Sz)) - if (Idx != Order[Idx] && Order[Idx] != Sz) - return false; - return true; - }; // Choose the most used order. 
unsigned IdentityCnt = 0; unsigned FilledIdentityCnt = 0; OrdersType IdentityOrder(VF, VF); for (auto &Pair : OrdersUses) { - if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (Pair.first.empty() || isIdentityOrder(Pair.first)) { if (!Pair.first.empty()) FilledIdentityCnt += Pair.second; IdentityCnt += Pair.second; @@ -5593,7 +5592,7 @@ void BoUpSLP::reorderTopToBottom() { if (Cnt < Pair.second || (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && Cnt == Pair.second && !BestOrder.empty() && - IsIdentityOrder(BestOrder))) { + isIdentityOrder(BestOrder))) { combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; @@ -5602,7 +5601,7 @@ void BoUpSLP::reorderTopToBottom() { } } // Set order of the user node. - if (IsIdentityOrder(BestOrder)) + if (isIdentityOrder(BestOrder)) continue; fixupOrderingIndices(BestOrder); SmallVector Mask; @@ -5891,19 +5890,12 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { OrderedEntries.remove(Op.second); continue; } - auto IsIdentityOrder = [](ArrayRef Order) { - const unsigned Sz = Order.size(); - for (unsigned Idx : seq(0, Sz)) - if (Idx != Order[Idx] && Order[Idx] != Sz) - return false; - return true; - }; // Choose the most used order. unsigned IdentityCnt = 0; unsigned VF = Data.second.front().second->getVectorFactor(); OrdersType IdentityOrder(VF, VF); for (auto &Pair : OrdersUses) { - if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (Pair.first.empty() || isIdentityOrder(Pair.first)) { IdentityCnt += Pair.second; combineOrders(IdentityOrder, Pair.first); } @@ -5923,7 +5915,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } } // Set order of the user node. 
- if (IsIdentityOrder(BestOrder)) { + if (isIdentityOrder(BestOrder)) { for (const std::pair &Op : Data.second) OrderedEntries.remove(Op.second); continue; @@ -6186,13 +6178,7 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in // reorderTopToBottom() and reorderBottomToTop(), so we are following the // same convention here. - auto IsIdentityOrder = [](const OrdersType &Order) { - for (unsigned Idx : seq(0, Order.size())) - if (Idx != Order[Idx]) - return false; - return true; - }; - if (IsIdentityOrder(ReorderIndices)) + if (isIdentityOrder(ReorderIndices)) ReorderIndices.clear(); return true; From fd0dbc7f4d8a5900535aa87569fbc385b7c50ba6 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Thu, 29 Aug 2024 11:02:43 -0700 Subject: [PATCH 31/72] [DirectX] add enum for PSV resource type/kind/flag. (#106227) Add ResourceType, ResourceKind and ResourceFlag enum class for PSV resource. This is for #103275 --- llvm/include/llvm/BinaryFormat/DXContainer.h | 25 +++++++++- .../BinaryFormat/DXContainerConstants.def | 46 +++++++++++++++++++ .../include/llvm/ObjectYAML/DXContainerYAML.h | 3 ++ llvm/lib/BinaryFormat/DXContainer.cpp | 30 ++++++++++++ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 18 ++++++++ .../DXContainer/DomainMaskVectors.yaml | 4 +- .../DXContainer/PSVv0-amplification.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-compute.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-domain.yaml | 8 ++-- .../DXContainer/PSVv0-geometry.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-hull.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-mesh.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-pixel.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-vertex.yaml | 8 ++-- .../DXContainer/PSVv1-amplification.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-compute.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-domain.yaml | 8 ++-- .../DXContainer/PSVv1-geometry.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-hull.yaml | 8 ++-- 
.../ObjectYAML/DXContainer/PSVv1-mesh.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-pixel.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-vertex.yaml | 8 ++-- .../DXContainer/PSVv2-amplification.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-compute.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-domain.yaml | 24 +++++----- .../DXContainer/PSVv2-geometry.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-hull.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-mesh.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-pixel.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-vertex.yaml | 24 +++++----- .../DXContainer/PSVv3-amplification.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-compute.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-domain.yaml | 24 +++++----- .../DXContainer/PSVv3-geometry.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-hull.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-mesh.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-pixel.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-vertex.yaml | 24 +++++----- .../ObjectYAML/DXContainer/SigElements.yaml | 4 +- llvm/unittests/Object/DXContainerTest.cpp | 40 ++++++++-------- 40 files changed, 400 insertions(+), 282 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 013431faff2728..a4cc814549c95b 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -299,6 +299,27 @@ enum class InterpolationMode : uint8_t { ArrayRef> getInterpolationModes(); +#define RESOURCE_TYPE(Val, Enum) Enum = Val, +enum class ResourceType : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getResourceTypes(); + +#define RESOURCE_KIND(Val, Enum) Enum = Val, +enum class ResourceKind : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getResourceKinds(); + +#define RESOURCE_FLAG(Val, Enum) Enum = Val, +enum class 
ResourceFlag : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getResourceFlags(); + namespace v0 { struct RuntimeInfo { PipelinePSVInfo StageInfo; @@ -315,7 +336,7 @@ struct RuntimeInfo { }; struct ResourceBindInfo { - uint32_t Type; + ResourceType Type; uint32_t Space; uint32_t LowerBound; uint32_t UpperBound; @@ -417,7 +438,7 @@ struct RuntimeInfo : public v1::RuntimeInfo { }; struct ResourceBindInfo : public v0::ResourceBindInfo { - uint32_t Kind; + ResourceKind Kind; uint32_t Flags; void swapBytes() { diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 62dc573555198b..4111cecb018bb3 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -150,6 +150,52 @@ INTERPOLATION_MODE(8, Invalid) #undef INTERPOLATION_MODE #endif // INTERPOLATION_MODE +#ifdef RESOURCE_TYPE +RESOURCE_TYPE(0, Invalid) +RESOURCE_TYPE(1, Sampler) +RESOURCE_TYPE(2, CBV) +RESOURCE_TYPE(3, SRVTyped) +RESOURCE_TYPE(4, SRVRaw) +RESOURCE_TYPE(5, SRVStructured) +RESOURCE_TYPE(6, UAVTyped) +RESOURCE_TYPE(7, UAVRaw) +RESOURCE_TYPE(8, UAVStructured) +RESOURCE_TYPE(9, UAVStructuredWithCounter) + +#undef RESOURCE_TYPE +#endif // RESOURCE_TYPE + +#ifdef RESOURCE_KIND +RESOURCE_KIND(0, Invalid) +RESOURCE_KIND(1, Texture1D) +RESOURCE_KIND(2, Texture2D) +RESOURCE_KIND(3, Texture2DMS) +RESOURCE_KIND(4, Texture3D) +RESOURCE_KIND(5, TextureCube) +RESOURCE_KIND(6, Texture1DArray) +RESOURCE_KIND(7, Texture2DArray) +RESOURCE_KIND(8, Texture2DMSArray) +RESOURCE_KIND(9, TextureCubeArray) +RESOURCE_KIND(10, TypedBuffer) +RESOURCE_KIND(11, RawBuffer) +RESOURCE_KIND(12, StructuredBuffer) +RESOURCE_KIND(13, CBuffer) +RESOURCE_KIND(14, Sampler) +RESOURCE_KIND(15, TBuffer) +RESOURCE_KIND(16, RTAccelerationStructure) +RESOURCE_KIND(17, FeedbackTexture2D) +RESOURCE_KIND(18, FeedbackTexture2DArray) + +#undef RESOURCE_KIND +#endif // RESOURCE_KIND + 
+#ifdef RESOURCE_FLAG +RESOURCE_FLAG(0, None) +RESOURCE_FLAG(1, UsedByAtomic64) + +#undef RESOURCE_FLAG +#endif // RESOURCE_FLAG + #ifdef D3D_SYSTEM_VALUE D3D_SYSTEM_VALUE(0, Undefined) diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 9c4d9e19f11ba3..e432359b7bbd07 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -176,6 +176,9 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureParameter) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::SemanticKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceType) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceFlag) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 9c0e657b069697..790947cc729c0b 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -89,3 +89,33 @@ static const EnumEntry InterpolationModeNames[] = { ArrayRef> PSV::getInterpolationModes() { return ArrayRef(InterpolationModeNames); } + +#define RESOURCE_TYPE(Val, Enum) {#Enum, PSV::ResourceType::Enum}, + +static const EnumEntry ResourceTypeNames[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> PSV::getResourceTypes() { + return ArrayRef(ResourceTypeNames); +} + +#define RESOURCE_KIND(Val, Enum) {#Enum, PSV::ResourceKind::Enum}, + +static const EnumEntry ResourceKindNames[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> PSV::getResourceKinds() { + return ArrayRef(ResourceKindNames); +} + +#define 
RESOURCE_FLAG(Val, Enum) {#Enum, PSV::ResourceFlag::Enum}, + +static const EnumEntry ResourceFlagNames[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> PSV::getResourceFlags() { + return ArrayRef(ResourceFlagNames); +} diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 38063670aee6e8..21a966d5abd132 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -254,6 +254,24 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::PSV::ResourceType &Value) { + for (const auto &E : dxbc::PSV::getResourceTypes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::PSV::ResourceKind &Value) { + for (const auto &E : dxbc::PSV::getResourceKinds()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::PSV::ResourceFlag &Value) { + for (const auto &E : dxbc::PSV::getResourceFlags()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + void ScalarEnumerationTraits::enumeration( IO &IO, dxbc::D3DSystemValue &Value) { for (const auto &E : dxbc::getD3DSystemValues()) diff --git a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml index 713fbc61e094b5..f3cfa90d1cf901 100644 --- a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml +++ b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml @@ -70,11 +70,11 @@ Parts: NumThreadsZ: 0 ResourceStride: 24 Resources: - - Type: 2 + - Type: CBV Space: 0 LowerBound: 0 UpperBound: 0 - Kind: 13 + Kind: CBuffer Flags: 0 SigInputElements: - Name: AAA_HSFoo diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml index d15bfadda41f07..3597b684fa032a 
100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml @@ -19,11 +19,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -48,11 +48,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml index 7e9f2fbd8b54de..4f8e60b780c560 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml @@ -18,11 +18,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -46,11 +46,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml index db2aee954b3466..fb8d148c286343 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml +++ 
b/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml @@ -21,11 +21,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -52,11 +52,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml index 5509ac669e2d6b..cd59c6a10d1a3e 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml @@ -22,11 +22,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -54,11 +54,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml index cd60f2b192b2e2..a672f9260516a4 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml @@ -22,11 +22,11 @@ Parts: MaximumWaveLaneCount: 4294967295 
ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -54,11 +54,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml index 07fb656c5b72a7..07fee2ff65e5bf 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml @@ -23,11 +23,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -56,11 +56,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml index c7f956e5740cca..6bf18f340c3399 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml @@ -20,11 +20,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: 
Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -50,11 +50,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml index 6df9169b73e2f5..e0690fba0e8c4c 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml @@ -19,11 +19,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -48,11 +48,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml index 982235549cddc6..beb85ee9828207 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml @@ -22,11 +22,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -62,11 +62,11 
@@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml index 629d45c65a2081..6c90fbb206c6af 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml @@ -21,11 +21,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -60,11 +60,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml index 941ec16544a2df..28a4884d1228f3 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml @@ -25,11 +25,11 @@ Parts: SigOutputVectors: [ 0, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -69,11 +69,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 0, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # 
CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml index a666cc4464d457..2884fd75e73d56 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml @@ -26,11 +26,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -70,11 +70,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml index c0f0f41e2318bd..0e71276ad1c16f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml @@ -26,11 +26,11 @@ Parts: SigOutputVectors: [ 0, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -71,11 +71,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 0, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 
# CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml index f981cb99cb968d..1af47f95c5e723 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml @@ -28,11 +28,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -74,11 +74,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml index a7b6804fe5fe6c..156e83f655e4f1 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml @@ -23,11 +23,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -64,11 +64,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid 
# CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml index a9590ba7704040..020e2345c6eec2 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml @@ -22,11 +22,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -62,11 +62,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml index c1ad560f463e1e..8bae742b573919 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml @@ -25,18 +25,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -72,18 +72,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: 
UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml index dc0ac3af9aa34b..74eb2b86ad01b2 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml @@ -24,18 +24,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -70,18 +70,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml 
b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml index 03e23b06d05e47..38f81bd93d67cf 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml @@ -28,18 +28,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,18 +79,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml index b4a5efd2276c52..99fdbbb7c9edaf 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml @@ -29,18 +29,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: 
[] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -80,18 +80,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml index a1c87343ee915b..de8af95dbcbd89 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml @@ -29,18 +29,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -81,18 +81,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 
65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml index 6155a3e2354bce..78fc077348f42a 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml @@ -31,18 +31,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -84,18 +84,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml index 3fdd7be8fe7de9..ebe1e51faff3f8 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml @@ -26,18 +26,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 
LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -74,18 +74,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml index eb77fb1b10d77a..2bca2f211136b2 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml @@ -25,18 +25,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -72,18 +72,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# 
CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml index 09885bd529f05f..9e31d40ec7c1b4 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml @@ -26,18 +26,18 @@ Parts: EntryName: ASEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -74,18 +74,18 @@ Parts: # CHECK-NEXT: EntryName: ASEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml 
b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml index ee6fb112c77222..530a8597cb6498 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml @@ -25,18 +25,18 @@ Parts: EntryName: CSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -72,18 +72,18 @@ Parts: # CHECK-NEXT: EntryName: CSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml index dd367deae88e47..a71ab67633eb6f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml @@ -29,18 +29,18 @@ Parts: EntryName: DSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] 
SigOutputElements: [] SigPatchOrPrimElements: [] @@ -81,18 +81,18 @@ Parts: # CHECK-NEXT: EntryName: DSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml index 4c7680b63b02b6..db530253c6a745 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml @@ -30,18 +30,18 @@ Parts: EntryName: GSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -82,18 +82,18 @@ Parts: # CHECK-NEXT: EntryName: GSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# 
CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml index 3bbad8a9b0ee62..3e3ba493e98450 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml @@ -30,18 +30,18 @@ Parts: EntryName: HSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -83,18 +83,18 @@ Parts: # CHECK-NEXT: EntryName: HSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml index c5ea1fcf078087..57bbcecfa1796b 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml @@ -32,18 +32,18 @@ Parts: EntryName: MSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: 
Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -86,18 +86,18 @@ Parts: # CHECK-NEXT: EntryName: MSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml index b28d5ec8074d85..c94c234142a34b 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml @@ -27,18 +27,18 @@ Parts: EntryName: PSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -76,18 +76,18 @@ Parts: # CHECK-NEXT: EntryName: PSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# 
CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml index d1fb55839931ca..697fa870c2257c 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml @@ -26,18 +26,18 @@ Parts: EntryName: VSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -74,18 +74,18 @@ Parts: # CHECK-NEXT: EntryName: VSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/SigElements.yaml 
b/llvm/test/ObjectYAML/DXContainer/SigElements.yaml index 47a18a6487c975..e7c72761901ebc 100644 --- a/llvm/test/ObjectYAML/DXContainer/SigElements.yaml +++ b/llvm/test/ObjectYAML/DXContainer/SigElements.yaml @@ -23,11 +23,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 9da6543c520c74..5a2c852d6aef97 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -266,15 +266,15 @@ TEST(DXCFile, PSVResourceIterators) { MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 1 LowerBound: 1 UpperBound: 1 - - Type: 2 + - Type: CBV Space: 2 LowerBound: 2 UpperBound: 2 - - Type: 3 + - Type: SRVTyped Space: 3 LowerBound: 3 UpperBound: 3 @@ -308,13 +308,13 @@ TEST(DXCFile, PSVResourceIterators) { dxbc::PSV::v2::ResourceBindInfo Binding; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Flags, 0u); ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 2u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::CBV); EXPECT_EQ(Binding.Flags, 0u); --It; @@ -322,25 +322,25 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(It == PSVInfo->getResources().begin()); - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Flags, 0u); --It; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Flags, 0u); ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 2u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::CBV); EXPECT_EQ(Binding.Flags, 0u); ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 3u); + EXPECT_EQ(Binding.Type, 
dxbc::PSV::ResourceType::SRVTyped); EXPECT_EQ(Binding.Flags, 0u); EXPECT_FALSE(It == PSVInfo->getResources().end()); @@ -351,7 +351,7 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(It == PSVInfo->getResources().end()); EXPECT_FALSE(It != PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); { @@ -361,7 +361,7 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(Old == PSVInfo->getResources().end()); EXPECT_FALSE(Old != PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); } @@ -369,7 +369,7 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(It == PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); { @@ -377,13 +377,13 @@ TEST(DXCFile, PSVResourceIterators) { Binding = *Old; EXPECT_TRUE(Old == PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); } Binding = *It; - EXPECT_EQ(Binding.Type, 3u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::SRVTyped); EXPECT_EQ(Binding.Flags, 0u); } @@ -587,7 +587,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { dxbc::PSV::v2::ResourceBindInfo Binding; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Space, 2u); EXPECT_EQ(Binding.LowerBound, 3u); EXPECT_EQ(Binding.UpperBound, 4u); @@ -595,7 +595,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 5u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::SRVStructured); EXPECT_EQ(Binding.Space, 6u); EXPECT_EQ(Binding.LowerBound, 7u); EXPECT_EQ(Binding.UpperBound, 8u); @@ -605,7 +605,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { EXPECT_TRUE(It == 
PSVInfo->getResources().begin()); - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Space, 2u); EXPECT_EQ(Binding.LowerBound, 3u); EXPECT_EQ(Binding.UpperBound, 4u); @@ -613,7 +613,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { --It; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Space, 2u); EXPECT_EQ(Binding.LowerBound, 3u); EXPECT_EQ(Binding.UpperBound, 4u); @@ -621,7 +621,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 5u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::SRVStructured); EXPECT_EQ(Binding.Space, 6u); EXPECT_EQ(Binding.LowerBound, 7u); EXPECT_EQ(Binding.UpperBound, 8u);; @@ -635,7 +635,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { EXPECT_TRUE(It == PSVInfo->getResources().end()); EXPECT_FALSE(It != PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); } From 1ace91f925ad87c3e5eb836ad58fdffe60c4aea6 Mon Sep 17 00:00:00 2001 From: Job Henandez Lara Date: Thu, 29 Aug 2024 11:14:18 -0700 Subject: [PATCH 32/72] [libc][math] Add performance tests for fmul and fmull. 
(#106262) --- .../BinaryOpSingleOutputPerf.h | 50 +++++++++++-------- .../math/performance_testing/CMakeLists.txt | 22 ++++++++ .../math/performance_testing/fmod_perf.cpp | 2 +- .../math/performance_testing/fmodf16_perf.cpp | 4 +- .../math/performance_testing/fmodf_perf.cpp | 2 +- .../math/performance_testing/fmul_perf.cpp | 23 +++++++++ .../math/performance_testing/fmull_perf.cpp | 23 +++++++++ .../math/performance_testing/hypot_perf.cpp | 2 +- .../math/performance_testing/hypotf_perf.cpp | 2 +- .../max_min_funcs_perf.cpp | 32 ++++++------ .../misc_basic_ops_perf.cpp | 6 +-- 11 files changed, 121 insertions(+), 47 deletions(-) create mode 100644 libc/test/src/math/performance_testing/fmul_perf.cpp create mode 100644 libc/test/src/math/performance_testing/fmull_perf.cpp diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h index 63d9768e21899b..98a1813bd7b54a 100644 --- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h +++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h @@ -16,15 +16,15 @@ namespace LIBC_NAMESPACE_DECL { namespace testing { - -template class BinaryOpSingleOutputPerf { - using FPBits = fputil::FPBits; +template +class BinaryOpSingleOutputPerf { + using FPBits = fputil::FPBits; using StorageType = typename FPBits::StorageType; static constexpr StorageType UIntMax = cpp::numeric_limits::max(); public: - typedef T Func(T, T); + typedef OutputType Func(InputType, InputType); static void run_perf_in_range(Func myFunc, Func otherFunc, StorageType startingBit, StorageType endingBit, @@ -33,7 +33,7 @@ template class BinaryOpSingleOutputPerf { N = cpp::min(N, static_cast(endingBit - startingBit)); auto runner = [=](Func func) { - [[maybe_unused]] volatile T result; + [[maybe_unused]] volatile OutputType result; if (endingBit < startingBit) { return; } @@ -42,8 +42,8 @@ template class BinaryOpSingleOutputPerf { for (size_t i = 0; 
i < rounds; i++) { for (StorageType bitsX = startingBit, bitsY = endingBit;; bitsX += step, bitsY -= step) { - T x = FPBits(bitsX).get_val(); - T y = FPBits(bitsY).get_val(); + InputType x = FPBits(bitsX).get_val(); + InputType y = FPBits(bitsY).get_val(); result = func(x, y); if (endingBit - bitsX < step) { break; @@ -94,10 +94,11 @@ template class BinaryOpSingleOutputPerf { 1'000'001, rounds, log); log << "\n Performance tests with inputs in normal range with exponents " "close to each other:\n"; - run_perf_in_range(myFunc, otherFunc, - /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(), - /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), - 1'000'001, rounds, log); + run_perf_in_range( + myFunc, otherFunc, + /* startingBit= */ FPBits(OutputType(0x1.0p-10)).uintval(), + /* endingBit= */ FPBits(OutputType(0x1.0p+10)).uintval(), 1'000'001, + rounds, log); } static void run_diff(Func myFunc, Func otherFunc, const char *logFile) { @@ -115,8 +116,10 @@ template class BinaryOpSingleOutputPerf { log << "\n Diff tests with inputs in normal range with exponents " "close to each other:\n"; diffCount += run_diff_in_range( - myFunc, otherFunc, /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(), - /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 10'000'001, log); + myFunc, otherFunc, + /* startingBit= */ FPBits(OutputType(0x1.0p-10)).uintval(), + /* endingBit= */ FPBits(OutputType(0x1.0p+10)).uintval(), 10'000'001, + log); log << "Total number of differing results: " << diffCount << '\n'; } @@ -125,18 +128,21 @@ template class BinaryOpSingleOutputPerf { } // namespace testing } // namespace LIBC_NAMESPACE_DECL -#define BINARY_OP_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename) \ +#define BINARY_OP_SINGLE_OUTPUT_PERF(OutputType, InputType, myFunc, otherFunc, \ + filename) \ int main() { \ - LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf::run_perf( \ - &myFunc, &otherFunc, 1, filename); \ + LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf< \ + OutputType, 
InputType>::run_perf(&myFunc, &otherFunc, 1, filename); \ return 0; \ } -#define BINARY_OP_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds, \ - filename) \ +#define BINARY_OP_SINGLE_OUTPUT_PERF_EX(OutputType, InputType, myFunc, \ + otherFunc, rounds, filename) \ { \ - LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf::run_perf( \ - &myFunc, &otherFunc, rounds, filename); \ - LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf::run_perf( \ - &myFunc, &otherFunc, rounds, filename); \ + LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf< \ + OutputType, InputType>::run_perf(&myFunc, &otherFunc, rounds, \ + filename); \ + LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf< \ + OutputType, InputType>::run_perf(&myFunc, &otherFunc, rounds, \ + filename); \ } diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt index 8e529ca09ed797..ed1b03f3493c7d 100644 --- a/libc/test/src/math/performance_testing/CMakeLists.txt +++ b/libc/test/src/math/performance_testing/CMakeLists.txt @@ -476,3 +476,25 @@ add_perf_binary( COMPILE_OPTIONS -fno-builtin ) + +add_perf_binary( + fmul_perf + SRCS + fmul_perf.cpp + DEPENDS + .binary_op_single_output_diff + libc.src.math.fmul + COMPILE_OPTIONS + -fno-builtin +) + +add_perf_binary( + fmull_perf + SRCS + fmull_perf.cpp + DEPENDS + .binary_op_single_output_diff + libc.src.math.fmull + COMPILE_OPTIONS + -fno-builtin +) diff --git a/libc/test/src/math/performance_testing/fmod_perf.cpp b/libc/test/src/math/performance_testing/fmod_perf.cpp index fa9b4c6b41287b..75a4242034226b 100644 --- a/libc/test/src/math/performance_testing/fmod_perf.cpp +++ b/libc/test/src/math/performance_testing/fmod_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(double, LIBC_NAMESPACE::fmod, ::fmod, +BINARY_OP_SINGLE_OUTPUT_PERF(double, double, LIBC_NAMESPACE::fmod, ::fmod, "fmod_perf.log") diff --git a/libc/test/src/math/performance_testing/fmodf16_perf.cpp 
b/libc/test/src/math/performance_testing/fmodf16_perf.cpp index ff01fa6ca5870e..062bc2da05adf9 100644 --- a/libc/test/src/math/performance_testing/fmodf16_perf.cpp +++ b/libc/test/src/math/performance_testing/fmodf16_perf.cpp @@ -16,11 +16,11 @@ #define FMOD_FUNC(U) (LIBC_NAMESPACE::fputil::generic::FMod::eval) int main() { - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, FMOD_FUNC(uint16_t), + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint32_t), 5000, "fmodf16_u16_vs_u32_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, FMOD_FUNC(uint16_t), + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint64_t), 5000, "fmodf16_u16_vs_u64_perf.log") return 0; diff --git a/libc/test/src/math/performance_testing/fmodf_perf.cpp b/libc/test/src/math/performance_testing/fmodf_perf.cpp index f13f02e2439da3..b4f37ef25e676f 100644 --- a/libc/test/src/math/performance_testing/fmodf_perf.cpp +++ b/libc/test/src/math/performance_testing/fmodf_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(float, LIBC_NAMESPACE::fmodf, ::fmodf, +BINARY_OP_SINGLE_OUTPUT_PERF(float, float, LIBC_NAMESPACE::fmodf, ::fmodf, "fmodf_perf.log") diff --git a/libc/test/src/math/performance_testing/fmul_perf.cpp b/libc/test/src/math/performance_testing/fmul_perf.cpp new file mode 100644 index 00000000000000..a215405eb6aa5d --- /dev/null +++ b/libc/test/src/math/performance_testing/fmul_perf.cpp @@ -0,0 +1,23 @@ +//===-- Performance test for the fmul function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BinaryOpSingleOutputPerf.h" +#include "src/math/fmul.h" + +static constexpr size_t DOUBLE_ROUNDS = 40; + +float fmul_placeholder_binary(double x, double y) { + return static_cast(x * y); +} + +int main() { + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, double, LIBC_NAMESPACE::fmul, + fmul_placeholder_binary, DOUBLE_ROUNDS, + "fmul_perf.log") + return 0; +} diff --git a/libc/test/src/math/performance_testing/fmull_perf.cpp b/libc/test/src/math/performance_testing/fmull_perf.cpp new file mode 100644 index 00000000000000..058e10288dbde8 --- /dev/null +++ b/libc/test/src/math/performance_testing/fmull_perf.cpp @@ -0,0 +1,23 @@ +//===-- Performance test for the fmull function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BinaryOpSingleOutputPerf.h" +#include "src/math/fmull.h" + +static constexpr size_t LONG_DOUBLE_ROUNDS = 40; + +float fmull_placeholder_binary(long double x, long double y) { + return static_cast(x * y); +} + +int main() { + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, long double, LIBC_NAMESPACE::fmull, + fmull_placeholder_binary, LONG_DOUBLE_ROUNDS, + "fmull_perf.log") + return 0; +} diff --git a/libc/test/src/math/performance_testing/hypot_perf.cpp b/libc/test/src/math/performance_testing/hypot_perf.cpp index 393697b7540330..04a493ff0e0258 100644 --- a/libc/test/src/math/performance_testing/hypot_perf.cpp +++ b/libc/test/src/math/performance_testing/hypot_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(double, LIBC_NAMESPACE::hypot, ::hypot, +BINARY_OP_SINGLE_OUTPUT_PERF(double, double, LIBC_NAMESPACE::hypot, ::hypot, "hypot_perf.log") diff --git a/libc/test/src/math/performance_testing/hypotf_perf.cpp b/libc/test/src/math/performance_testing/hypotf_perf.cpp index f711729377dacf..8a42f792263c98 100644 --- a/libc/test/src/math/performance_testing/hypotf_perf.cpp +++ b/libc/test/src/math/performance_testing/hypotf_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(float, LIBC_NAMESPACE::hypotf, ::hypotf, +BINARY_OP_SINGLE_OUTPUT_PERF(float, float, LIBC_NAMESPACE::hypotf, ::hypotf, "hypotf_perf.log") diff --git a/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp index 9540112e69ea6a..b77268d107c587 100644 --- a/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp +++ b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp @@ -35,39 +35,39 @@ float16 placeholder_binaryf16(float16 x, float16 y) { return x; } float placeholder_binaryf(float x, float y) { return x; } int main() { - 
BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaxf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fmaxf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fmaxf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fminf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fminf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaximumf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fmaximumf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fmaximumf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminimumf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fminimumf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fminimumf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaximum_numf16, - placeholder_binaryf16, FLOAT16_ROUNDS, - "fmaximum_numf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminimum_numf16, - placeholder_binaryf16, FLOAT16_ROUNDS, - "fminimum_numf16_perf.log") + BINARY_OP_SINGLE_OUTPUT_PERF_EX( + float16, float16, LIBC_NAMESPACE::fmaximum_numf16, placeholder_binaryf16, + FLOAT16_ROUNDS, "fmaximum_numf16_perf.log") + BINARY_OP_SINGLE_OUTPUT_PERF_EX( + float16, float16, LIBC_NAMESPACE::fminimum_numf16, placeholder_binaryf16, + FLOAT16_ROUNDS, "fminimum_numf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaxf, ::fmaxf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fmaxf, ::fmaxf, FLOAT_ROUNDS, "fmaxf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminf, ::fminf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fminf, ::fminf, FLOAT_ROUNDS, "fminf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaximumf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fmaximumf, placeholder_binaryf, FLOAT_ROUNDS, 
"fmaximumf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminimumf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fminimumf, placeholder_binaryf, FLOAT_ROUNDS, "fminimumf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaximum_numf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fmaximum_numf, placeholder_binaryf, FLOAT_ROUNDS, "fmaximum_numf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminimum_numf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fminimum_numf, placeholder_binaryf, FLOAT_ROUNDS, "fminimum_numf_perf.log") diff --git a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp index ace1d21c62c325..9a4522c307ac76 100644 --- a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp +++ b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp @@ -28,14 +28,14 @@ int main() { SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fabsf16, placeholder_unaryf16, FLOAT16_ROUNDS, "fabsf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::copysignf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::copysignf16, placeholder_binaryf16, FLOAT16_ROUNDS, "copysignf16_perf.log") SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fabsf, fabsf, FLOAT_ROUNDS, "fabsf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::copysignf, copysignf, - FLOAT_ROUNDS, "copysignf_perf.log") + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::copysignf, + copysignf, FLOAT_ROUNDS, "copysignf_perf.log") return 0; } From 0a41c8e7a050c837c609cbcbc8342024701cd14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 29 Aug 2024 11:27:42 -0700 Subject: [PATCH 33/72] 
[flang][cuda] Avoid generating cuf.data_transfer in OpenACC region (#106435) `cuf.data_transfer` will be converted to runtime calls to cuda runtime api and these are not supported in device code. assignment in OpenACC region will be handled by the OpenACC code gen so we avoid to generate data transfer on them. --- flang/lib/Lower/Bridge.cpp | 10 ++++- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 44 +++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c48daba8cf7fab..078e17bea55859 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4380,9 +4380,14 @@ class FirConverter : public Fortran::lower::AbstractConverter { // Check if the insertion point is currently in a device context. HostDevice // subprogram are not considered fully device context so it will return false // for it. - static bool isDeviceContext(fir::FirOpBuilder &builder) { + // If the insertion point is inside an OpenACC region op, it is considered + // device context. 
+ static bool isCudaDeviceContext(fir::FirOpBuilder &builder) { if (builder.getRegion().getParentOfType()) return true; + if (builder.getRegion() + .getParentOfType()) + return true; if (auto funcOp = builder.getRegion().getParentOfType()) { if (auto cudaProcAttr = @@ -4401,7 +4406,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Location loc = getCurrentLocation(); fir::FirOpBuilder &builder = getFirOpBuilder(); - bool isInDeviceContext = isDeviceContext(builder); + bool isInDeviceContext = isCudaDeviceContext(builder); + bool isCUDATransfer = (Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs) || Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs)) && !isInDeviceContext; diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 42b37fb89e4ce2..f189bf9b621082 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -1,4 +1,4 @@ -! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir -fopenacc -fcuda %s -o - | FileCheck %s ! Test CUDA Fortran data transfer using assignment statements. @@ -290,3 +290,45 @@ end subroutine ! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> ! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {uniq_name = "_QFsub15Ea_host"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) ! CHECK: cuf.data_transfer %[[AHOST]]#1 to %[[ADEV]]#1, %[[SHAPE]] : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer} : !fir.ref>, !fir.ref> + +! 
Check that cuf.data_transfer are not generated within OpenACC region +subroutine sub16() + integer, parameter :: n = 10 + real, device :: adev(n) + real :: ahost(n) + real, managed :: b + integer :: i + + adev = ahost + !$acc parallel loop deviceptr(adev) + do i = 1, n + adev(i) = adev(i) + b + enddo + + !$acc kernels deviceptr(adev) + do i = 1, n + adev(i) = adev(i) + b + enddo + !$acc end kernels + + + !$acc serial deviceptr(adev) + do i = 1, n + adev(i) = adev(i) + b + enddo + !$acc end serial +end subroutine + +! CHECK-LABEL: func.func @_QPsub16() +! CHECK: cuf.data_transfer +! CHECK: acc.parallel +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign + +! CHECK: acc.kernels +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign + +! CHECK: acc.serial +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign From 6421dcc0a978900091cc7aa8fa443746602cb442 Mon Sep 17 00:00:00 2001 From: Haopeng Liu <153236845+haopliu@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:28:49 -0700 Subject: [PATCH 34/72] [NFC] [DSE] Refactor DSE (#100956) Refactor DSE with MemoryDefWrapper and MemoryLocationWrapper. Normally, one MemoryDef accesses one MemoryLocation. With "initializes" attribute, one MemoryDef (like call instruction) could initialize multiple MemoryLocations. Refactor DSE as a preparation to apply "initializes" attribute in DSE in a follow-up PR (https://github.com/llvm/llvm-project/commit/58dd8a440343055b1a4929d72317218e912c16fd). 
--- .../Scalar/DeadStoreElimination.cpp | 367 ++++++++++-------- 1 file changed, 209 insertions(+), 158 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 992139a95a43d3..a37f295abbd31c 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -806,6 +806,34 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { return false; } +// A memory location wrapper that represents a MemoryLocation, `MemLoc`, +// defined by `MemDef`. +struct MemoryLocationWrapper { + MemoryLocationWrapper(MemoryLocation MemLoc, MemoryDef *MemDef) + : MemLoc(MemLoc), MemDef(MemDef) { + assert(MemLoc.Ptr && "MemLoc should be not null"); + UnderlyingObject = getUnderlyingObject(MemLoc.Ptr); + DefInst = MemDef->getMemoryInst(); + } + + MemoryLocation MemLoc; + const Value *UnderlyingObject; + MemoryDef *MemDef; + Instruction *DefInst; +}; + +// A memory def wrapper that represents a MemoryDef and the MemoryLocation(s) +// defined by this MemoryDef. +struct MemoryDefWrapper { + MemoryDefWrapper(MemoryDef *MemDef, std::optional MemLoc) { + DefInst = MemDef->getMemoryInst(); + if (MemLoc.has_value()) + DefinedLocation = MemoryLocationWrapper(*MemLoc, MemDef); + } + Instruction *DefInst; + std::optional DefinedLocation = std::nullopt; +}; + struct DSEState { Function &F; AliasAnalysis &AA; @@ -1119,6 +1147,15 @@ struct DSEState { return MemoryLocation::getOrNone(I); } + std::optional getLocForInst(Instruction *I) { + if (isMemTerminatorInst(I)) { + if (auto Loc = getLocForTerminator(I)) { + return Loc->first; + } + } + return getLocForWrite(I); + } + /// Assuming this instruction has a dead analyzable write, can we delete /// this instruction? 
bool isRemovable(Instruction *I) { @@ -2132,182 +2169,196 @@ struct DSEState { } return MadeChange; } -}; -static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI, - const LoopInfo &LI) { - bool MadeChange = false; + // Try to eliminate dead defs that access `KillingLocWrapper.MemLoc` and are + // killed by `KillingLocWrapper.MemDef`. Return whether + // any changes were made, and whether `KillingLocWrapper.DefInst` was deleted. + std::pair + eliminateDeadDefs(const MemoryLocationWrapper &KillingLocWrapper); - DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); - // For each store: - for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *KillingDef = State.MemDefs[I]; - if (State.SkipStores.count(KillingDef)) + // Try to eliminate dead defs killed by `KillingDefWrapper` and return the + // change state: whether make any change. + bool eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper); +}; + +std::pair +DSEState::eliminateDeadDefs(const MemoryLocationWrapper &KillingLocWrapper) { + bool Changed = false; + bool DeletedKillingLoc = false; + unsigned ScanLimit = MemorySSAScanLimit; + unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit; + unsigned PartialLimit = MemorySSAPartialStoreLimit; + // Worklist of MemoryAccesses that may be killed by + // "KillingLocWrapper.MemDef". + SmallSetVector ToCheck; + // Track MemoryAccesses that have been deleted in the loop below, so we can + // skip them. Don't use SkipStores for this, which may contain reused + // MemoryAccess addresses. + SmallPtrSet Deleted; + [[maybe_unused]] unsigned OrigNumSkipStores = SkipStores.size(); + ToCheck.insert(KillingLocWrapper.MemDef->getDefiningAccess()); + + // Check if MemoryAccesses in the worklist are killed by + // "KillingLocWrapper.MemDef". 
+ for (unsigned I = 0; I < ToCheck.size(); I++) { + MemoryAccess *Current = ToCheck[I]; + if (Deleted.contains(Current)) continue; - Instruction *KillingI = KillingDef->getMemoryInst(); + std::optional MaybeDeadAccess = getDomMemoryDef( + KillingLocWrapper.MemDef, Current, KillingLocWrapper.MemLoc, + KillingLocWrapper.UnderlyingObject, ScanLimit, WalkerStepLimit, + isMemTerminatorInst(KillingLocWrapper.DefInst), PartialLimit); - std::optional MaybeKillingLoc; - if (State.isMemTerminatorInst(KillingI)) { - if (auto KillingLoc = State.getLocForTerminator(KillingI)) - MaybeKillingLoc = KillingLoc->first; - } else { - MaybeKillingLoc = State.getLocForWrite(KillingI); + if (!MaybeDeadAccess) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; } - - if (!MaybeKillingLoc) { - LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " - << *KillingI << "\n"); + MemoryAccess *DeadAccess = *MaybeDeadAccess; + LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess); + if (isa(DeadAccess)) { + LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n"); + for (Value *V : cast(DeadAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DeadAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. 
+ if (PostOrderNumbers[IncomingBlock] > PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } continue; } - MemoryLocation KillingLoc = *MaybeKillingLoc; - assert(KillingLoc.Ptr && "KillingLoc should not be null"); - const Value *KillingUndObj = getUnderlyingObject(KillingLoc.Ptr); - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " - << *KillingDef << " (" << *KillingI << ")\n"); - - unsigned ScanLimit = MemorySSAScanLimit; - unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit; - unsigned PartialLimit = MemorySSAPartialStoreLimit; - // Worklist of MemoryAccesses that may be killed by KillingDef. - SmallSetVector ToCheck; - // Track MemoryAccesses that have been deleted in the loop below, so we can - // skip them. Don't use SkipStores for this, which may contain reused - // MemoryAccess addresses. - SmallPtrSet Deleted; - [[maybe_unused]] unsigned OrigNumSkipStores = State.SkipStores.size(); - ToCheck.insert(KillingDef->getDefiningAccess()); - - bool Shortend = false; - bool IsMemTerm = State.isMemTerminatorInst(KillingI); - // Check if MemoryAccesses in the worklist are killed by KillingDef. 
- for (unsigned I = 0; I < ToCheck.size(); I++) { - MemoryAccess *Current = ToCheck[I]; - if (Deleted.contains(Current)) - continue; - - std::optional MaybeDeadAccess = State.getDomMemoryDef( - KillingDef, Current, KillingLoc, KillingUndObj, ScanLimit, - WalkerStepLimit, IsMemTerm, PartialLimit); - - if (!MaybeDeadAccess) { - LLVM_DEBUG(dbgs() << " finished walk\n"); + MemoryDefWrapper DeadDefWrapper( + cast(DeadAccess), + getLocForInst(cast(DeadAccess)->getMemoryInst())); + MemoryLocationWrapper &DeadLocWrapper = *DeadDefWrapper.DefinedLocation; + LLVM_DEBUG(dbgs() << " (" << *DeadLocWrapper.DefInst << ")\n"); + ToCheck.insert(DeadLocWrapper.MemDef->getDefiningAccess()); + NumGetDomMemoryDefPassed++; + + if (!DebugCounter::shouldExecute(MemorySSACounter)) + continue; + if (isMemTerminatorInst(KillingLocWrapper.DefInst)) { + if (KillingLocWrapper.UnderlyingObject != DeadLocWrapper.UnderlyingObject) continue; + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DeadLocWrapper.DefInst << "\n KILLER: " + << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(DeadLocWrapper.DefInst, &Deleted); + ++NumFastStores; + Changed = true; + } else { + // Check if DeadI overwrites KillingI. + int64_t KillingOffset = 0; + int64_t DeadOffset = 0; + OverwriteResult OR = + isOverwrite(KillingLocWrapper.DefInst, DeadLocWrapper.DefInst, + KillingLocWrapper.MemLoc, DeadLocWrapper.MemLoc, + KillingOffset, DeadOffset); + if (OR == OW_MaybePartial) { + auto Iter = + IOLs.insert(std::make_pair( + DeadLocWrapper.DefInst->getParent(), InstOverlapIntervalsTy())); + auto &IOL = Iter.first->second; + OR = isPartialOverwrite(KillingLocWrapper.MemLoc, DeadLocWrapper.MemLoc, + KillingOffset, DeadOffset, + DeadLocWrapper.DefInst, IOL); } - - MemoryAccess *DeadAccess = *MaybeDeadAccess; - LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess); - if (isa(DeadAccess)) { - LLVM_DEBUG(dbgs() << "\n ... 
adding incoming values to worklist\n"); - for (Value *V : cast(DeadAccess)->incoming_values()) { - MemoryAccess *IncomingAccess = cast(V); - BasicBlock *IncomingBlock = IncomingAccess->getBlock(); - BasicBlock *PhiBlock = DeadAccess->getBlock(); - - // We only consider incoming MemoryAccesses that come before the - // MemoryPhi. Otherwise we could discover candidates that do not - // strictly dominate our starting def. - if (State.PostOrderNumbers[IncomingBlock] > - State.PostOrderNumbers[PhiBlock]) - ToCheck.insert(IncomingAccess); + if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) { + auto *DeadSI = dyn_cast(DeadLocWrapper.DefInst); + auto *KillingSI = dyn_cast(KillingLocWrapper.DefInst); + // We are re-using tryToMergePartialOverlappingStores, which requires + // DeadSI to dominate KillingSI. + // TODO: implement tryToMergeParialOverlappingStores using MemorySSA. + if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) { + if (Constant *Merged = tryToMergePartialOverlappingStores( + KillingSI, DeadSI, KillingOffset, DeadOffset, DL, BatchAA, + &DT)) { + + // Update stored value of earlier store to merged constant. + DeadSI->setOperand(0, Merged); + ++NumModifiedStores; + Changed = true; + DeletedKillingLoc = true; + + // Remove killing store and remove any outstanding overlap + // intervals for the updated store. 
+ deleteDeadInstruction(KillingSI, &Deleted); + auto I = IOLs.find(DeadSI->getParent()); + if (I != IOLs.end()) + I->second.erase(DeadSI); + break; + } } - continue; } - auto *DeadDefAccess = cast(DeadAccess); - Instruction *DeadI = DeadDefAccess->getMemoryInst(); - LLVM_DEBUG(dbgs() << " (" << *DeadI << ")\n"); - ToCheck.insert(DeadDefAccess->getDefiningAccess()); - NumGetDomMemoryDefPassed++; - - if (!DebugCounter::shouldExecute(MemorySSACounter)) - continue; - - MemoryLocation DeadLoc = *State.getLocForWrite(DeadI); - - if (IsMemTerm) { - const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr); - if (KillingUndObj != DeadUndObj) - continue; - LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI - << "\n KILLER: " << *KillingI << '\n'); - State.deleteDeadInstruction(DeadI, &Deleted); + if (OR == OW_Complete) { + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DeadLocWrapper.DefInst << "\n KILLER: " + << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(DeadLocWrapper.DefInst, &Deleted); ++NumFastStores; - MadeChange = true; - } else { - // Check if DeadI overwrites KillingI. - int64_t KillingOffset = 0; - int64_t DeadOffset = 0; - OverwriteResult OR = State.isOverwrite( - KillingI, DeadI, KillingLoc, DeadLoc, KillingOffset, DeadOffset); - if (OR == OW_MaybePartial) { - auto Iter = State.IOLs.insert( - std::make_pair( - DeadI->getParent(), InstOverlapIntervalsTy())); - auto &IOL = Iter.first->second; - OR = isPartialOverwrite(KillingLoc, DeadLoc, KillingOffset, - DeadOffset, DeadI, IOL); - } - - if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) { - auto *DeadSI = dyn_cast(DeadI); - auto *KillingSI = dyn_cast(KillingI); - // We are re-using tryToMergePartialOverlappingStores, which requires - // DeadSI to dominate KillingSI. - // TODO: implement tryToMergeParialOverlappingStores using MemorySSA. 
- if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) { - if (Constant *Merged = tryToMergePartialOverlappingStores( - KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL, - State.BatchAA, &DT)) { - - // Update stored value of earlier store to merged constant. - DeadSI->setOperand(0, Merged); - ++NumModifiedStores; - MadeChange = true; - - Shortend = true; - // Remove killing store and remove any outstanding overlap - // intervals for the updated store. - State.deleteDeadInstruction(KillingSI, &Deleted); - auto I = State.IOLs.find(DeadSI->getParent()); - if (I != State.IOLs.end()) - I->second.erase(DeadSI); - break; - } - } - } - - if (OR == OW_Complete) { - LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI - << "\n KILLER: " << *KillingI << '\n'); - State.deleteDeadInstruction(DeadI, &Deleted); - ++NumFastStores; - MadeChange = true; - } + Changed = true; } } + } - assert(State.SkipStores.size() - OrigNumSkipStores == Deleted.size() && - "SkipStores and Deleted out of sync?"); + assert(SkipStores.size() - OrigNumSkipStores == Deleted.size() && + "SkipStores and Deleted out of sync?"); - // Check if the store is a no-op. - if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) { - LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI - << '\n'); - State.deleteDeadInstruction(KillingI); - NumRedundantStores++; - MadeChange = true; - continue; - } + return {Changed, DeletedKillingLoc}; +} - // Can we form a calloc from a memset/malloc pair? 
- if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) { - LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n" - << " DEAD: " << *KillingI << '\n'); - State.deleteDeadInstruction(KillingI); - MadeChange = true; +bool DSEState::eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper) { + if (!KillingDefWrapper.DefinedLocation.has_value()) { + LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " + << *KillingDefWrapper.DefInst << "\n"); + return false; + } + + auto &KillingLocWrapper = *KillingDefWrapper.DefinedLocation; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *KillingLocWrapper.MemDef << " (" + << *KillingLocWrapper.DefInst << ")\n"); + auto [Changed, DeletedKillingLoc] = eliminateDeadDefs(KillingLocWrapper); + + // Check if the store is a no-op. + if (!DeletedKillingLoc && storeIsNoop(KillingLocWrapper.MemDef, + KillingLocWrapper.UnderlyingObject)) { + LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " + << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(KillingLocWrapper.DefInst); + NumRedundantStores++; + return true; + } + // Can we form a calloc from a memset/malloc pair? 
+ if (!DeletedKillingLoc && + tryFoldIntoCalloc(KillingLocWrapper.MemDef, + KillingLocWrapper.UnderlyingObject)) { + LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n" + << " DEAD: " << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(KillingLocWrapper.DefInst); + return true; + } + return Changed; +} + +static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI, + const LoopInfo &LI) { + bool MadeChange = false; + DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); + // For each store: + for (unsigned I = 0; I < State.MemDefs.size(); I++) { + MemoryDef *KillingDef = State.MemDefs[I]; + if (State.SkipStores.count(KillingDef)) continue; - } + + MemoryDefWrapper KillingDefWrapper( + KillingDef, State.getLocForInst(KillingDef->getMemoryInst())); + MadeChange |= State.eliminateDeadDefs(KillingDefWrapper); } if (EnablePartialOverwriteTracking) From 22ba3511087c85e3b1d4cad686f8d9c3aa6f8088 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 11:35:43 -0700 Subject: [PATCH 35/72] [RISCV][SLP] Test for <3 x Ty> reductions which require reordering These tests show a vectorizable reduction where the order of the reduction has been adjusted so that profitable vectorization requires a reordering of the computation. We currently have no reordering in SLP for non-power-of-two vectors, so this doesn't work. Note that due to reassociation performed in the standard pipeline, this is actually the canonical form for a reduction reaching SLP. 
--- .../SLPVectorizer/RISCV/vec3-base.ll | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 1ff286248c4a7a..c712acc4ea1893 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -554,6 +554,52 @@ define i32 @dot_product_i32(ptr %a, ptr %b) { ret i32 %add.1 } +; Same as above, except the reduction order has been perturbed. This +; is checking for our ability to reorder. +define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { +; CHECK-LABEL: @dot_product_i32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret i32 [[ADD_1]] +; + %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0 + %l.a.0 = load i32, ptr %gep.a.0, align 4 + 
%gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1 + %l.a.1 = load i32, ptr %gep.a.1, align 4 + %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2 + %l.a.2 = load i32, ptr %gep.a.2, align 4 + + %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0 + %l.b.0 = load i32, ptr %gep.b.0, align 4 + %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1 + %l.b.1 = load i32, ptr %gep.b.1, align 4 + %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2 + %l.b.2 = load i32, ptr %gep.b.2, align 4 + + %mul.0 = mul nsw i32 %l.a.0, %l.b.0 + %mul.1 = mul nsw i32 %l.a.1, %l.b.1 + %mul.2 = mul nsw i32 %l.a.2, %l.b.2 + + %add.0 = add i32 %mul.1, %mul.0 + %add.1 = add i32 %add.0, %mul.2 + ret i32 %add.1 +} + define float @dot_product_fp32(ptr %a, ptr %b) { ; NON-POW2-LABEL: @dot_product_fp32( ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 @@ -604,6 +650,50 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ret float %add.1 } +; Same as above, except the reduction order has been perturbed. This +; is checking for our ability to reorder. 
+define float @dot_product_fp32_reorder(ptr %a, ptr %b) { +; CHECK-LABEL: @dot_product_fp32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] +; + %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 + %l.a.0 = load float, ptr %gep.a.0, align 4 + %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1 + %l.a.1 = load float, ptr %gep.a.1, align 4 + %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2 + %l.a.2 = load float, ptr %gep.a.2, align 4 + + %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0 + %l.b.0 = load float, ptr %gep.b.0, align 4 + %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1 + %l.b.1 = load float, ptr %gep.b.1, align 4 + %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2 + %l.b.2 = load float, ptr %gep.b.2, align 4 + + %mul.0 = fmul fast float %l.a.0, %l.b.0 + %mul.1 = fmul fast float %l.a.1, %l.b.1 + %mul.2 = fmul fast float %l.a.2, %l.b.2 + + %add.0 = fadd fast float %mul.1, %mul.0 + %add.1 = fadd fast float %add.0, %mul.2 + 
ret float %add.1 +} + + define double @dot_product_fp64(ptr %a, ptr %b) { ; NON-POW2-LABEL: @dot_product_fp64( ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 From 26b0bef192be1a3adc250af460c2e728a1ca5a64 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 29 Aug 2024 11:43:58 -0700 Subject: [PATCH 36/72] AMDGPU: Use pattern to select instruction for intrinsic llvm.fptrunc.round (#105761) Use GCNPat instead of Custom Lowering to select instructions for intrinsic llvm.fptrunc.round. "SupportedRoundMode : TImmLeaf" is used as a predicate to select only when the rounding mode is supported. "as_hw_round_mode : SDNodeXForm" is developed to translate the round modes to the corresponding ones that hardware recognizes. --- .../Target/GlobalISel/SelectionDAGCompat.td | 1 + .../include/llvm/Target/TargetSelectionDAG.td | 5 + llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 5 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 - llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 - .../AMDGPU/AMDGPUInstructionSelector.cpp | 10 ++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 33 +--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 1 - .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 28 +--- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 - llvm/lib/Target/AMDGPU/SIInstrInfo.td | 22 ++- llvm/lib/Target/AMDGPU/SIInstructions.td | 11 +- .../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 7 +- .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 158 +++++++++--------- 16 files changed, 128 insertions(+), 161 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index e9dbdef9fe9e7c..72d155b483cf2b 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -161,6 +161,7 @@ def : GINodeEquiv; def : 
GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 172deffbd31771..dd79002dcbdb48 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -158,6 +158,9 @@ def SDTFPUnaryOp : SDTypeProfile<1, 1, [ // fneg, fsqrt, etc def SDTFPRoundOp : SDTypeProfile<1, 1, [ // fpround SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1> ]>; +def SDTFPTruncRoundOp : SDTypeProfile<1, 2, [ + SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1> +]>; def SDTFPExtendOp : SDTypeProfile<1, 1, [ // fpextend SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1> ]>; @@ -552,6 +555,8 @@ def llround : SDNode<"ISD::LLROUND" , SDTFPToIntOp>; def lrint : SDNode<"ISD::LRINT" , SDTFPToIntOp>; def llrint : SDNode<"ISD::LLRINT" , SDTFPToIntOp>; +def fptrunc_round : SDNode<"ISD::FPTRUNC_ROUND", SDTFPTruncRoundOp>; + def fpround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>; def fpextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>; def fcopysign : SDNode<"ISD::FCOPYSIGN" , SDTFPSignOp>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 8bee84b8a87f27..118271af879937 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -297,8 +297,6 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; - class GISelSop2Pat < SDPatternOperator node, Instruction inst, @@ -419,3 +417,6 @@ def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameInde def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, GISDNodeXFormEquiv; + +def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, + GISDNodeXFormEquiv; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index d24836b7eeb095..015dbc79ef9e4d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5511,7 +5511,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(LDS) - NODE_NAME_CASE(FPTRUNC_ROUND) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 59f640ea99de3e..dd9d97bd593bda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -553,7 +553,6 @@ enum NodeType : unsigned { CONST_DATA_PTR, PC_ADD_REL_OFFSET, LDS, - FPTRUNC_ROUND, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 17071970ca4bfe..3fcb364fc2c536 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5594,6 +5594,16 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB, MIB.addImm(ExpVal); } +void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 + // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 + // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 + // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 + MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); +} + bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const { return TII.isInlineConstant(Imm); } diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 207cd67f0eda0e..068db5c1c14496 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -359,6 +359,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + bool isInlineImmediate(const APInt &Imm) const; bool isInlineImmediate(const APFloat &Imm) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 4fd917f5ea7fa8..3f6486d44f0ee5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1137,7 +1137,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .lower(); getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) - .customFor({S16, S32}) + .legalFor({S16, S32}) .scalarize(0) .lower(); @@ -2179,8 +2179,6 @@ bool AMDGPULegalizerInfo::legalizeCustom( return legalizeCTLZ_CTTZ(MI, MRI, B); case TargetOpcode::G_CTLZ_ZERO_UNDEF: return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); - case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: - return legalizeFPTruncRound(MI, B); case TargetOpcode::G_STACKSAVE: return legalizeStackSave(MI, B); case TargetOpcode::G_GET_FPENV: @@ -7093,35 +7091,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, - MachineIRBuilder &B) const { - MachineRegisterInfo &MRI = *B.getMRI(); - Register Src = MI.getOperand(1).getReg(); - if (MRI.getType(Src) != LLT::scalar(32)) - return false; - - // Only support towardzero, tonearest, upward and downward. 
- int RoundMode = MI.getOperand(2).getImm(); - if (RoundMode != (int)RoundingMode::TowardZero && - RoundMode != (int)RoundingMode::NearestTiesToEven && - RoundMode != (int)RoundingMode::TowardPositive && - RoundMode != (int)RoundingMode::TowardNegative) - return false; - - // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 - // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 - // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 - // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 - unsigned HW_Mode = (RoundMode + 3) % 4; - B.buildInstr(AMDGPU::G_FPTRUNC_ROUND) - .addDef(MI.getOperand(0).getReg()) - .addUse(Src) - .addImm(HW_Mode); - - MI.eraseFromParent(); - return true; -} - bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const { const SITargetLowering *TLI = ST.getTargetLowering(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index db1c5874093a71..a815e87a7da35f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -212,7 +212,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; - bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 69a1936a11fe05..4737a322c255f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5255,7 +5255,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); break; } - case AMDGPU::G_FPTRUNC_ROUND: + case 
AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: return getDefaultMappingVOP(MI); case AMDGPU::G_PREFETCH: OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1437f3d58b5e79..81b52935ddf397 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -598,7 +598,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP1 Actions. setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, - ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, + ISD::FSIN, ISD::FROUND}, MVT::f16, Custom); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); @@ -5797,8 +5797,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return lowerFP_ROUND(Op, DAG); - case ISD::FPTRUNC_ROUND: - return lowerFPTRUNC_ROUND(Op, DAG); case ISD::TRAP: return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: @@ -6648,30 +6646,6 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, DAG.getTargetConstant(0, DL, MVT::i32)); } -SDValue SITargetLowering::lowerFPTRUNC_ROUND(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0)->getValueType(0) != MVT::f32) - return SDValue(); - - // Only support towardzero, tonearest, upward and downward. 
- int RoundMode = Op.getConstantOperandVal(1); - if (RoundMode != (int)RoundingMode::TowardZero && - RoundMode != (int)RoundingMode::NearestTiesToEven && - RoundMode != (int)RoundingMode::TowardPositive && - RoundMode != (int)RoundingMode::TowardNegative) - return SDValue(); - - // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 - // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 - // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 - // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 - unsigned HW_Mode = (RoundMode + 3) % 4; - SDLoc DL(Op); - SDValue RoundFlag = DAG.getTargetConstant(HW_Mode, DL, MVT::i32); - return DAG.getNode(AMDGPUISD::FPTRUNC_ROUND, DL, Op.getNode()->getVTList(), - Op->getOperand(0), RoundFlag); -} - SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f16 && "Do not know how to custom lower FP_ROUND for non-f16 type"); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index eed4b3e79cdeee..1f198a92c0fa6a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -145,7 +145,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 2b54429dc9a03f..faa8ca282e7ab8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -304,12 +304,6 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] >; -def SDTFPRoundModeOp : SDTypeProfile<1, 2, [ - SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1> -]>; - -def SIfptrunc_round : SDNode<"AMDGPUISD::FPTRUNC_ROUND", SDTFPRoundModeOp>; - //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -796,6 +790,22 @@ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; +def as_hw_round_mode : SDNodeXForm TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 + // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 + // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 + // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 + return CurDAG->getTargetConstant((N->getSExtValue() + 3) % 4, SDLoc(N), + MVT::i32); +}]>; + +def SupportedRoundMode : TImmLeaf; + class bitextract_imm : SDNodeXFormgetZExtValue(); unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 814d3182fb5df8..69e1b9a38324f2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -229,10 +229,12 @@ def S_INVERSE_BALLOT_U64 : 
SPseudoInstSI< // in the ModeRegister pass. let Uses = [MODE, EXEC] in { def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VGPR_32:$src0, i32imm:$round), - [(set f16:$vdst, (SIfptrunc_round f32:$src0, i32:$round))]>; + (ins VGPR_32:$src0, i32imm:$round)>; } // End Uses = [MODE, EXEC] +def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>; + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. let Defs = [SCC], isConvergent = 1 in { @@ -4055,11 +4057,6 @@ def G_SI_CALL : AMDGPUGenericInstruction { let isConvergent = 1; } -def G_FPTRUNC_ROUND : AMDGPUGenericInstruction { - let OutOperandList = (outs type0:$vdst); - let InOperandList = (ins type1:$src0, untyped_imm_0:$round); - let hasSideEffects = 0; -} //============================================================================// // Dummy Instructions diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll index 4bcd0cf5e6a0e5..f1d5b07e832c48 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll @@ -1,9 +1,8 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefixes=SDAG-FAIL -; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=GISEL-FAIL +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) { -; SDAG-FAIL: LLVM ERROR: Cannot select -; GISEL-FAIL: unable to legalize instruction +; FAIL: 
LLVM ERROR: Cannot select %res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward") store half %res, ptr addrspace(1) %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll index 71d0ee524bab73..54ed6f1eb42820 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll @@ -176,8 +176,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ; return to shader part epilog %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") ret <2 x half> %res @@ -197,8 +196,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ; return to shader part epilog %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward") ret <2 x half> %res @@ -228,23 +226,18 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2 ; 
GISEL-NEXT: v_cvt_f16_f32_e32 v7, v3 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7 +; GISEL-NEXT: v_pack_b32_f16 v1, v1, v2 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 -; GISEL-NEXT: v_pk_add_f16 v0, v0, v1 -; GISEL-NEXT: v_pk_add_f16 v0, v2, v0 +; GISEL-NEXT: v_pk_add_f16 v0, v0, v3 +; GISEL-NEXT: v_pk_add_f16 v0, v1, v0 ; GISEL-NEXT: global_store_dword v[4:5], v0, off ; GISEL-NEXT: s_endpgm %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") @@ -295,31 +288,54 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> } define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) { -; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: v_mov_b32_e32 v3, s2 -; CHECK-NEXT: v_mov_b32_e32 v4, s1 -; CHECK-NEXT: v_mov_b32_e32 v5, s3 -; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v3 -; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CHECK-NEXT: v_cvt_f16_f32_e32 v7, v5 -; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CHECK-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; CHECK-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; CHECK-NEXT: 
v_cvt_f16_f32_e32 v4, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 -; CHECK-NEXT: v_pk_add_f16 v2, v2, v5 -; CHECK-NEXT: v_pk_add_f16 v2, v3, v2 -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm +; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SDAG-NEXT: v_mov_b32_e32 v4, s1 +; SDAG-NEXT: v_mov_b32_e32 v5, s3 +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 +; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 +; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SDAG-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SDAG-NEXT: v_lshl_or_b32 v5, v7, 16, v6 +; SDAG-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 +; SDAG-NEXT: v_pk_add_f16 v2, v2, v5 +; SDAG-NEXT: v_pk_add_f16 v2, v3, v2 +; SDAG-NEXT: global_store_dword v[0:1], v2, off +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GISEL-NEXT: v_pack_b32_f16 v2, v2, v3 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GISEL-NEXT: 
v_cvt_f16_f32_e32 v4, v5 +; GISEL-NEXT: v_pack_b32_f16 v5, v6, v7 +; GISEL-NEXT: v_pack_b32_f16 v3, v3, v4 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 +; GISEL-NEXT: v_pk_add_f16 v2, v2, v5 +; GISEL-NEXT: v_pk_add_f16 v2, v3, v2 +; GISEL-NEXT: global_store_dword v[0:1], v2, off +; GISEL-NEXT: s_endpgm %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward") %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward") @@ -344,8 +360,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GISEL-NEXT: ; return to shader part epilog %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward") @@ -367,8 +382,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GISEL-NEXT: ; return to shader part epilog %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward") @@ -391,13 +405,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> % ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 ; GISEL-NEXT: ; return to shader part epilog %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward") ret <4 x half> %res @@ -419,13 +431,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 ; GISEL-NEXT: ; return to shader part epilog %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward") ret <4 x half> %res @@ -453,21 +463,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> % ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 +; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5 +; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7 ; GISEL-NEXT: ; return to shader part epilog %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward") ret <8 x half> %res @@ -495,21 +501,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 +; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5 +; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7 ; GISEL-NEXT: ; return to shader part epilog %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward") ret <8 x half> %res From c1248c9d64e9210554571283980156b1d85cfe09 Mon Sep 17 00:00:00 2001 
From: "Oleksandr T." Date: Thu, 29 Aug 2024 21:45:46 +0300 Subject: [PATCH 37/72] [Clang] prevent assertion failure when converting vectors to int/float with invalid expressions (#105727) Fixes #105486 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaExpr.cpp | 6 ++++++ clang/test/SemaCXX/vector.cpp | 14 ++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6df8bc64f1c7db..27f3d6e05da9f5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -326,6 +326,7 @@ Bug Fixes to C++ Support - Fix evaluation of the index of dependent pack indexing expressions/types specifiers (#GH105900) - Correctly handle subexpressions of an immediate invocation in the presence of implicit casts. (#GH105558) - Clang now correctly handles direct-list-initialization of a structured bindings from an array. (#GH31813) +- Fixed an assertion failure when converting vectors to int/float with invalid expressions. (#GH105486) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 95f53dfefbcc52..de316f30e9523d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9888,6 +9888,9 @@ static ExprResult convertVector(Expr *E, QualType ElementType, Sema &S) { /// IntTy without losing precision. static bool canConvertIntToOtherIntTy(Sema &S, ExprResult *Int, QualType OtherIntTy) { + if (Int->get()->containsErrors()) + return false; + QualType IntTy = Int->get()->getType().getUnqualifiedType(); // Reject cases where the value of the Int is unknown as that would @@ -9926,6 +9929,9 @@ static bool canConvertIntToOtherIntTy(Sema &S, ExprResult *Int, /// FloatTy without losing precision. 
static bool canConvertIntTyToFloatTy(Sema &S, ExprResult *Int, QualType FloatTy) { + if (Int->get()->containsErrors()) + return false; + QualType IntTy = Int->get()->getType().getUnqualifiedType(); // Determine if the integer constant can be expressed as a floating point diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp index 7c8ee89814e578..808bdb679b09cd 100644 --- a/clang/test/SemaCXX/vector.cpp +++ b/clang/test/SemaCXX/vector.cpp @@ -772,3 +772,17 @@ void test_scoped_enum_vector(EnumClass ea, v2u v2ua) { } #endif } + +namespace GH105486 { +__attribute__((__vector_size__(sizeof(double)))) double a; +double b = a - (long)(*0); // expected-error {{indirection requires pointer operand ('int' invalid)}} \ + // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}} + +__attribute__((__vector_size__(sizeof(long)))) long c; +long d = c - (long)(*0); // expected-error {{indirection requires pointer operand ('int' invalid)}} \ + // expected-error {{cannot initialize a variable of type 'long' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}} + +const long long e = *0; // expected-error {{indirection requires pointer operand ('int' invalid)}} +double f = a - e; // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}} +int h = c - e; // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}} +} From e9eaf19eb605c14bed7a0f76d206c13a8eaf842f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Ferenc=20Szab=C3=B3?= <30732159+dfszabo@users.noreply.github.com> Date: Thu, 29 Aug 2024 20:53:28 +0200 Subject: [PATCH 38/72] [CodeGen] Allow 
mixed scalar type constraints for inline asm (#65465) GCC supports code like "asm volatile ("" : "=r" (i) : "0" (f))" where i is integer type and f is floating point type. Currently this code produces an error with Clang. The change allows mixed scalar types between input and output constraints. Co-authored-by: Matt Arsenault --- .../SelectionDAG/SelectionDAGBuilder.cpp | 8 ++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 7 ++- llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll | 61 +++++++++++++++++++ 3 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 521a4fee8aafe0..4b326ba76f97f2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9591,9 +9591,11 @@ static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo, std::pair InputRC = TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode, MatchingOpInfo.ConstraintVT); - if ((OpInfo.ConstraintVT.isInteger() != - MatchingOpInfo.ConstraintVT.isInteger()) || - (MatchRC.second != InputRC.second)) { + const bool OutOpIsIntOrFP = + OpInfo.ConstraintVT.isInteger() || OpInfo.ConstraintVT.isFloatingPoint(); + const bool InOpIsIntOrFP = MatchingOpInfo.ConstraintVT.isInteger() || + MatchingOpInfo.ConstraintVT.isFloatingPoint(); + if ((OutOpIsIntOrFP != InOpIsIntOrFP) || (MatchRC.second != InputRC.second)) { // FIXME: error out in a more elegant fashion report_fatal_error("Unsupported asm: input constraint" " with a matching output constraint of" diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4e796289cff0a1..01feec0c435edf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5856,8 +5856,11 @@ 
TargetLowering::ParseConstraints(const DataLayout &DL, std::pair InputRC = getRegForInlineAsmConstraint(TRI, Input.ConstraintCode, Input.ConstraintVT); - if ((OpInfo.ConstraintVT.isInteger() != - Input.ConstraintVT.isInteger()) || + const bool OutOpIsIntOrFP = OpInfo.ConstraintVT.isInteger() || + OpInfo.ConstraintVT.isFloatingPoint(); + const bool InOpIsIntOrFP = Input.ConstraintVT.isInteger() || + Input.ConstraintVT.isFloatingPoint(); + if ((OutOpIsIntOrFP != InOpIsIntOrFP) || (MatchRC.second != InputRC.second)) { report_fatal_error("Unsupported asm: input constraint" " with a matching output constraint of" diff --git a/llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll b/llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll new file mode 100644 index 00000000000000..d2255d9970b123 --- /dev/null +++ b/llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr +avx < %s | FileCheck %s + +; The C source used as a base for generating this test:. 
+ +; unsigned test(float f) +; { +; unsigned i; +; // Copies f into the output operand i +; asm volatile ("" : "=r" (i) : "0" (f)); +; return i; +; } + + +define i32 @test_int_float(float %f) { +; CHECK-LABEL: test_int_float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: retq +entry: + %asm_call = call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(float %f) + ret i32 %asm_call +} + +define i32 @test_int_ptr(ptr %f) { +; CHECK-LABEL: test_int_ptr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq +entry: + %asm_call = call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(ptr %f) + ret i32 %asm_call +} + +define i64 @test_int_vec(<4 x i16> %v) { +; CHECK-LABEL: test_int_vec: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: retq +entry: + %asm_call = call i64 asm sideeffect "", "=v,0,~{dirflag},~{fpsr},~{flags}"(<4 x i16> %v) + ret i64 %asm_call +} + +define <4 x i32> @test_int_vec_float_vec(<4 x float> %f) { +; CHECK-LABEL: test_int_vec_float_vec: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: retq +entry: + %asm_call = call <4 x i32> asm sideeffect "", "=v,0,~{dirflag},~{fpsr},~{flags}"(<4 x float> %f) + ret <4 x i32> %asm_call +} From ff04c5b2e69481fc3b828bfcf32e05ff7a2c4b05 Mon Sep 17 00:00:00 2001 From: Dan Liew Date: Thu, 29 Aug 2024 12:00:28 -0700 Subject: [PATCH 39/72] [NFC][Sema] Move `Sema::AssignmentAction` into its own scoped enum (#106453) The primary motivation behind this is to allow the enum type to be referred to earlier in the Sema.h file which is needed for #106321. It was requested in #106321 that a scoped enum be used (rather than moving the enum declaration earlier in the Sema class declaration). 
Unfortunately doing this creates a lot of churn as all use sites of the enum constants had to be changed. Appologies to all downstream forks in advanced. Note the AA_ prefix has been dropped from the enum value names as they are now redundant. --- clang/include/clang/Sema/Sema.h | 31 +++++++----- clang/lib/Sema/SemaARM.cpp | 5 +- clang/lib/Sema/SemaCast.cpp | 4 +- clang/lib/Sema/SemaChecking.cpp | 3 +- clang/lib/Sema/SemaDeclCXX.cpp | 3 +- clang/lib/Sema/SemaExpr.cpp | 32 ++++++------ clang/lib/Sema/SemaExprCXX.cpp | 47 +++++++++--------- clang/lib/Sema/SemaInit.cpp | 23 ++++----- clang/lib/Sema/SemaOpenMP.cpp | 76 ++++++++++++++++------------- clang/lib/Sema/SemaOverload.cpp | 30 +++++++----- clang/lib/Sema/SemaPseudoObject.cpp | 2 +- clang/lib/Sema/SemaStmt.cpp | 3 +- 12 files changed, 142 insertions(+), 117 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 821182e8356428..0358259945c796 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -204,6 +204,24 @@ class SemaPPCallbacks; class TemplateDeductionInfo; } // namespace sema +// AssignmentAction - This is used by all the assignment diagnostic functions +// to represent what is actually causing the operation +enum class AssignmentAction { + Assigning, + Passing, + Returning, + Converting, + Initializing, + Sending, + Casting, + Passing_CFAudited +}; +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const AssignmentAction &AA) { + DB << llvm::to_underlying(AA); + return DB; +} + namespace threadSafety { class BeforeSet; void threadSafetyCleanup(BeforeSet *Cache); @@ -6493,19 +6511,6 @@ class Sema final : public SemaBase { /// cleanup that are created by the current full expression. 
SmallVector ExprCleanupObjects; - // AssignmentAction - This is used by all the assignment diagnostic functions - // to represent what is actually causing the operation - enum AssignmentAction { - AA_Assigning, - AA_Passing, - AA_Returning, - AA_Converting, - AA_Initializing, - AA_Sending, - AA_Casting, - AA_Passing_CFAudited - }; - /// Determine whether the use of this declaration is valid, without /// emitting diagnostics. bool CanUseDecl(NamedDecl *D, bool TreatUnavailableAsInvalid); diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index e18872f0dc551e..185e0427d5c995 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -795,7 +795,8 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI, if (RHS.isInvalid()) return true; if (SemaRef.DiagnoseAssignmentResult(ConvTy, Arg->getBeginLoc(), LHSTy, - RHSTy, RHS.get(), Sema::AA_Assigning)) + RHSTy, RHS.get(), + AssignmentAction::Assigning)) return true; } @@ -921,7 +922,7 @@ bool SemaARM::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CastNeeded = CK_BitCast; Diag(DRE->getBeginLoc(), diag::ext_typecheck_convert_discards_qualifiers) << PointerArg->getType() << Context.getPointerType(AddrType) - << Sema::AA_Passing << PointerArg->getSourceRange(); + << AssignmentAction::Passing << PointerArg->getSourceRange(); } // Finally, do the cast and replace the argument with the corrected version. diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index eca8363ee9605c..f01b22a72915c8 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2673,7 +2673,7 @@ void CastOperation::checkAddressSpaceCast(QualType SrcType, QualType DestType) { ? 
DestPPointee.getAddressSpace() != SrcPPointee.getAddressSpace() : !DestPPointee.isAddressSpaceOverlapping(SrcPPointee)) { Self.Diag(OpRange.getBegin(), DiagID) - << SrcType << DestType << Sema::AA_Casting + << SrcType << DestType << AssignmentAction::Casting << SrcExpr.get()->getSourceRange(); if (!Nested) SrcExpr = ExprError(); @@ -3213,7 +3213,7 @@ void CastOperation::CheckCStyleCast() { !CastQuals.compatiblyIncludesObjCLifetime(ExprQuals)) { Self.Diag(SrcExpr.get()->getBeginLoc(), diag::err_typecheck_incompatible_ownership) - << SrcType << DestType << Sema::AA_Casting + << SrcType << DestType << AssignmentAction::Casting << SrcExpr.get()->getSourceRange(); return; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index ee143381cf4f79..b021e27209cf1b 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4880,7 +4880,8 @@ bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs, if (Arg->isTypeDependent()) return false; - ExprResult Res = PerformImplicitConversion(Arg, Context.IntTy, AA_Passing); + ExprResult Res = PerformImplicitConversion(Arg, Context.IntTy, + AssignmentAction::Passing); if (Res.isInvalid()) return true; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d89a47f3e6226a..3044f1218f5b23 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -10871,7 +10871,8 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) { ExprResult This = ActOnCXXThis(OperatorDelete->getParamDecl(0)->getLocation()); assert(!This.isInvalid() && "couldn't form 'this' expr in dtor?"); - This = PerformImplicitConversion(This.get(), ParamType, AA_Passing); + This = PerformImplicitConversion(This.get(), ParamType, + AssignmentAction::Passing); if (This.isInvalid()) { // FIXME: Register this as a context note so that it comes out // in the right order. 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index de316f30e9523d..dcb08790911e74 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9586,7 +9586,7 @@ Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS, QualType RHSType = RHS.get()->getType(); if (Diagnose) { RHS = PerformImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(), - AA_Assigning); + AssignmentAction::Assigning); } else { ImplicitConversionSequence ICS = TryImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(), @@ -9598,7 +9598,7 @@ Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS, if (ICS.isFailure()) return Incompatible; RHS = PerformImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(), - ICS, AA_Assigning); + ICS, AssignmentAction::Assigning); } if (RHS.isInvalid()) return Incompatible; @@ -13660,8 +13660,8 @@ QualType Sema::CheckAssignmentOperands(Expr *LHSExpr, ExprResult &RHS, ConvTy = CheckAssignmentConstraints(Loc, LHSType, RHSType); } - if (DiagnoseAssignmentResult(ConvTy, Loc, LHSType, RHSType, - RHS.get(), AA_Assigning)) + if (DiagnoseAssignmentResult(ConvTy, Loc, LHSType, RHSType, RHS.get(), + AssignmentAction::Assigning)) return QualType(); CheckForNullPointerDereference(*this, LHSExpr); @@ -16669,7 +16669,7 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, MayHaveConvFixit = true; break; case IncompatiblePointer: - if (Action == AA_Passing_CFAudited) { + if (Action == AssignmentAction::Passing_CFAudited) { DiagKind = diag::err_arc_typecheck_convert_incompatible_pointer; } else if (getLangOpts().CPlusPlus) { DiagKind = diag::err_typecheck_convert_incompatible_pointer; @@ -16823,19 +16823,19 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, QualType FirstType, SecondType; switch (Action) { - case AA_Assigning: - case AA_Initializing: + case AssignmentAction::Assigning: + case AssignmentAction::Initializing: // The destination type comes 
first. FirstType = DstType; SecondType = SrcType; break; - case AA_Returning: - case AA_Passing: - case AA_Passing_CFAudited: - case AA_Converting: - case AA_Sending: - case AA_Casting: + case AssignmentAction::Returning: + case AssignmentAction::Passing: + case AssignmentAction::Passing_CFAudited: + case AssignmentAction::Converting: + case AssignmentAction::Sending: + case AssignmentAction::Casting: // The source type comes first. FirstType = SrcType; SecondType = DstType; @@ -16844,8 +16844,8 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, PartialDiagnostic FDiag = PDiag(DiagKind); AssignmentAction ActionForDiag = Action; - if (Action == AA_Passing_CFAudited) - ActionForDiag = AA_Passing; + if (Action == AssignmentAction::Passing_CFAudited) + ActionForDiag = AssignmentAction::Passing; FDiag << FirstType << SecondType << ActionForDiag << SrcExpr->getSourceRange(); @@ -16885,7 +16885,7 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, if (CheckInferredResultType) ObjC().EmitRelatedResultTypeNote(SrcExpr); - if (Action == AA_Returning && ConvTy == IncompatiblePointer) + if (Action == AssignmentAction::Returning && ConvTy == IncompatiblePointer) ObjC().EmitRelatedResultTypeNoteForReturn(DstType); if (Complained) diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d8719ab26cc83f..b7531581d37ff0 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -2199,8 +2199,8 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, if (getLangOpts().CPlusPlus14) { assert(Context.getTargetInfo().getIntWidth() && "Builtin type of size 0?"); - ConvertedSize = PerformImplicitConversion(*ArraySize, Context.getSizeType(), - AA_Converting); + ConvertedSize = PerformImplicitConversion( + *ArraySize, Context.getSizeType(), AssignmentAction::Converting); if (!ConvertedSize.isInvalid() && (*ArraySize)->getType()->getAs()) @@ -3851,7 +3851,8 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, 
bool UseGlobal, Context.getQualifiedType(Pointee.getUnqualifiedType(), Qs)); Ex = ImpCastExprToType(Ex.get(), Unqual, CK_NoOp); } - Ex = PerformImplicitConversion(Ex.get(), ParamType, AA_Passing); + Ex = PerformImplicitConversion(Ex.get(), ParamType, + AssignmentAction::Passing); if (Ex.isInvalid()) return ExprError(); } @@ -4256,10 +4257,9 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, } // Watch out for ellipsis conversion. if (!ICS.UserDefined.EllipsisConversion) { - ExprResult Res = - PerformImplicitConversion(From, BeforeToType, - ICS.UserDefined.Before, AA_Converting, - CCK); + ExprResult Res = PerformImplicitConversion( + From, BeforeToType, ICS.UserDefined.Before, + AssignmentAction::Converting, CCK); if (Res.isInvalid()) return ExprError(); From = Res.get(); @@ -4282,7 +4282,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, return From; return PerformImplicitConversion(From, ToType, ICS.UserDefined.After, - AA_Converting, CCK); + AssignmentAction::Converting, CCK); } case ImplicitConversionSequence::AmbiguousConversion: @@ -4451,19 +4451,19 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, // target entity shall allow at least the exceptions allowed by the // source value in the assignment or initialization. switch (Action) { - case AA_Assigning: - case AA_Initializing: + case AssignmentAction::Assigning: + case AssignmentAction::Initializing: // Note, function argument passing and returning are initialization. 
- case AA_Passing: - case AA_Returning: - case AA_Sending: - case AA_Passing_CFAudited: + case AssignmentAction::Passing: + case AssignmentAction::Returning: + case AssignmentAction::Sending: + case AssignmentAction::Passing_CFAudited: if (CheckExceptionSpecCompatibility(From, ToType)) return ExprError(); break; - case AA_Casting: - case AA_Converting: + case AssignmentAction::Casting: + case AssignmentAction::Converting: // Casts and implicit conversions are not initialization, so are not // checked for exception specification mismatches. break; @@ -4577,9 +4577,10 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, case ICK_Writeback_Conversion: case ICK_Pointer_Conversion: { - if (SCS.IncompatibleObjC && Action != AA_Casting) { + if (SCS.IncompatibleObjC && Action != AssignmentAction::Casting) { // Diagnose incompatible Objective-C conversions - if (Action == AA_Initializing || Action == AA_Assigning) + if (Action == AssignmentAction::Initializing || + Action == AssignmentAction::Assigning) Diag(From->getBeginLoc(), diag::ext_typecheck_convert_incompatible_pointer) << ToType << From->getType() << Action << From->getSourceRange() @@ -4596,12 +4597,12 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, } else if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() && !ObjC().CheckObjCARCUnavailableWeakConversion(ToType, From->getType())) { - if (Action == AA_Initializing) + if (Action == AssignmentAction::Initializing) Diag(From->getBeginLoc(), diag::err_arc_weak_unavailable_assign); else Diag(From->getBeginLoc(), diag::err_arc_convesion_of_weak_unavailable) - << (Action == AA_Casting) << From->getType() << ToType - << From->getSourceRange(); + << (Action == AssignmentAction::Casting) << From->getType() + << ToType << From->getSourceRange(); } // Defer address space conversion to the third conversion. @@ -6666,14 +6667,14 @@ static bool FindConditionalOverload(Sema &Self, ExprResult &LHS, ExprResult &RHS // We found a match. 
Perform the conversions on the arguments and move on. ExprResult LHSRes = Self.PerformImplicitConversion( LHS.get(), Best->BuiltinParamTypes[0], Best->Conversions[0], - Sema::AA_Converting); + AssignmentAction::Converting); if (LHSRes.isInvalid()) break; LHS = LHSRes; ExprResult RHSRes = Self.PerformImplicitConversion( RHS.get(), Best->BuiltinParamTypes[1], Best->Conversions[1], - Sema::AA_Converting); + AssignmentAction::Converting); if (RHSRes.isInvalid()) break; RHS = RHSRes; diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 5a19a3505454ca..7dc17187524621 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6799,43 +6799,44 @@ InitializationSequence::~InitializationSequence() { //===----------------------------------------------------------------------===// // Perform initialization //===----------------------------------------------------------------------===// -static Sema::AssignmentAction -getAssignmentAction(const InitializedEntity &Entity, bool Diagnose = false) { +static AssignmentAction getAssignmentAction(const InitializedEntity &Entity, + bool Diagnose = false) { switch(Entity.getKind()) { case InitializedEntity::EK_Variable: case InitializedEntity::EK_New: case InitializedEntity::EK_Exception: case InitializedEntity::EK_Base: case InitializedEntity::EK_Delegating: - return Sema::AA_Initializing; + return AssignmentAction::Initializing; case InitializedEntity::EK_Parameter: if (Entity.getDecl() && isa(Entity.getDecl()->getDeclContext())) - return Sema::AA_Sending; + return AssignmentAction::Sending; - return Sema::AA_Passing; + return AssignmentAction::Passing; case InitializedEntity::EK_Parameter_CF_Audited: if (Entity.getDecl() && isa(Entity.getDecl()->getDeclContext())) - return Sema::AA_Sending; + return AssignmentAction::Sending; - return !Diagnose ? Sema::AA_Passing : Sema::AA_Passing_CFAudited; + return !Diagnose ? 
AssignmentAction::Passing + : AssignmentAction::Passing_CFAudited; case InitializedEntity::EK_Result: case InitializedEntity::EK_StmtExprResult: // FIXME: Not quite right. - return Sema::AA_Returning; + return AssignmentAction::Returning; case InitializedEntity::EK_Temporary: case InitializedEntity::EK_RelatedResult: // FIXME: Can we tell apart casting vs. converting? - return Sema::AA_Casting; + return AssignmentAction::Casting; case InitializedEntity::EK_TemplateParameter: // This is really initialization, but refer to it as conversion for // consistency with CheckConvertedConstantExpression. - return Sema::AA_Converting; + return AssignmentAction::Converting; case InitializedEntity::EK_Member: case InitializedEntity::EK_ParenAggInitMember: @@ -6847,7 +6848,7 @@ getAssignmentAction(const InitializedEntity &Entity, bool Diagnose = false) { case InitializedEntity::EK_LambdaToBlockConversionBlockElement: case InitializedEntity::EK_LambdaCapture: case InitializedEntity::EK_CompoundLiteralInit: - return Sema::AA_Initializing; + return AssignmentAction::Initializing; } llvm_unreachable("Invalid EntityKind!"); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 74c646f64b42f2..23c4903ec15855 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -7395,7 +7395,8 @@ SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG, return std::nullopt; } VariantRefCast = SemaRef.PerformImplicitConversion( - VariantRef, FnPtrType.getUnqualifiedType(), Sema::AA_Converting); + VariantRef, FnPtrType.getUnqualifiedType(), + AssignmentAction::Converting); if (!VariantRefCast.isUsable()) return std::nullopt; } @@ -8415,9 +8416,10 @@ tryBuildCapture(Sema &SemaRef, Expr *Capture, if (SemaRef.CurContext->isDependentContext() || Capture->containsErrors()) return Capture; if (Capture->isEvaluatable(SemaRef.Context, Expr::SE_AllowSideEffects)) - return SemaRef.PerformImplicitConversion( - Capture->IgnoreImpCasts(), 
Capture->getType(), Sema::AA_Converting, - /*AllowExplicit=*/true); + return SemaRef.PerformImplicitConversion(Capture->IgnoreImpCasts(), + Capture->getType(), + AssignmentAction::Converting, + /*AllowExplicit=*/true); auto I = Captures.find(Capture); if (I != Captures.end()) return buildCapture(SemaRef, Capture, I->second, Name); @@ -8517,7 +8519,7 @@ calculateNumIters(Sema &SemaRef, Scope *S, SourceLocation DefaultLoc, SemaRef .PerformImplicitConversion( SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Upper).get(), - CastType, Sema::AA_Converting) + CastType, AssignmentAction::Converting) .get(); Lower = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Lower).get(); NewStep = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, NewStep.get()); @@ -8801,8 +8803,9 @@ Expr *OpenMPIterationSpaceChecker::buildNumIterations( : Type->hasSignedIntegerRepresentation(); Type = C.getIntTypeForBitwidth(NewSize, IsSigned); if (!SemaRef.Context.hasSameType(Diff.get()->getType(), Type)) { - Diff = SemaRef.PerformImplicitConversion( - Diff.get(), Type, Sema::AA_Converting, /*AllowExplicit=*/true); + Diff = SemaRef.PerformImplicitConversion(Diff.get(), Type, + AssignmentAction::Converting, + /*AllowExplicit=*/true); if (!Diff.isUsable()) return nullptr; } @@ -8820,7 +8823,8 @@ Expr *OpenMPIterationSpaceChecker::buildNumIterations( C.getTypeSize(Type) < NewSize); if (!SemaRef.Context.hasSameType(Diff.get()->getType(), NewType)) { Diff = SemaRef.PerformImplicitConversion(Diff.get(), NewType, - Sema::AA_Converting, true); + AssignmentAction::Converting, + /*AllowExplicit=*/true); if (!Diff.isUsable()) return nullptr; } @@ -8892,7 +8896,7 @@ std::pair OpenMPIterationSpaceChecker::buildMinMaxValues( SemaRef.Context.getUnsignedPointerDiffType())) { Diff = SemaRef.PerformImplicitConversion( Diff.get(), SemaRef.Context.getUnsignedPointerDiffType(), - Sema::AA_Converting, /*AllowExplicit=*/true); + AssignmentAction::Converting, /*AllowExplicit=*/true); } if (!Diff.isUsable()) return 
std::make_pair(nullptr, nullptr); @@ -8920,7 +8924,7 @@ std::pair OpenMPIterationSpaceChecker::buildMinMaxValues( // Convert to the original type. if (SemaRef.Context.hasSameType(Diff.get()->getType(), VarType)) Diff = SemaRef.PerformImplicitConversion(Diff.get(), VarType, - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true); if (!Diff.isUsable()) return std::make_pair(nullptr, nullptr); @@ -8955,7 +8959,7 @@ Expr *OpenMPIterationSpaceChecker::buildPreCond( return SemaRef .PerformImplicitConversion( SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get(), - SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting, + SemaRef.Context.BoolTy, /*Action=*/AssignmentAction::Casting, /*AllowExplicit=*/true) .get(); @@ -8976,7 +8980,8 @@ Expr *OpenMPIterationSpaceChecker::buildPreCond( if (!SemaRef.Context.hasSameUnqualifiedType(CondExpr.get()->getType(), SemaRef.Context.BoolTy)) CondExpr = SemaRef.PerformImplicitConversion( - CondExpr.get(), SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting, + CondExpr.get(), SemaRef.Context.BoolTy, + /*Action=*/AssignmentAction::Casting, /*AllowExplicit=*/true); } @@ -9393,7 +9398,7 @@ buildCounterInit(Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef, if (!SemaRef.Context.hasSameType(NewStart.get()->getType(), VarRef.get()->getType())) { NewStart = SemaRef.PerformImplicitConversion( - NewStart.get(), VarRef.get()->getType(), Sema::AA_Converting, + NewStart.get(), VarRef.get()->getType(), AssignmentAction::Converting, /*AllowExplicit=*/true); if (!NewStart.isUsable()) return ExprError(); @@ -9469,7 +9474,8 @@ static ExprResult buildCounterUpdate( if (!SemaRef.Context.hasSameType(Update.get()->getType(), VarRef.get()->getType())) { Update = SemaRef.PerformImplicitConversion( - Update.get(), VarRef.get()->getType(), Sema::AA_Converting, true); + Update.get(), VarRef.get()->getType(), AssignmentAction::Converting, + /*AllowExplicit=*/true); if (!Update.isUsable()) return ExprError(); } @@ -9491,8 +9497,8 
@@ static ExprResult widenIterationCount(unsigned Bits, Expr *E, Sema &SemaRef) { return ExprResult(E); // OK to convert to signed, because new type has more bits than old. QualType NewType = C.getIntTypeForBitwidth(Bits, /*Signed=*/true); - return SemaRef.PerformImplicitConversion(E, NewType, Sema::AA_Converting, - true); + return SemaRef.PerformImplicitConversion( + E, NewType, AssignmentAction::Converting, /*AllowExplicit=*/true); } /// Check if the given expression \a E is a constant integer that fits @@ -9752,19 +9758,19 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, // true). auto PreCond = ExprResult(IterSpaces[0].PreCond); Expr *N0 = IterSpaces[0].NumIterations; - ExprResult LastIteration32 = - widenIterationCount(/*Bits=*/32, - SemaRef - .PerformImplicitConversion( - N0->IgnoreImpCasts(), N0->getType(), - Sema::AA_Converting, /*AllowExplicit=*/true) - .get(), - SemaRef); + ExprResult LastIteration32 = widenIterationCount( + /*Bits=*/32, + SemaRef + .PerformImplicitConversion(N0->IgnoreImpCasts(), N0->getType(), + AssignmentAction::Converting, + /*AllowExplicit=*/true) + .get(), + SemaRef); ExprResult LastIteration64 = widenIterationCount( /*Bits=*/64, SemaRef .PerformImplicitConversion(N0->IgnoreImpCasts(), N0->getType(), - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true) .get(), SemaRef); @@ -9790,7 +9796,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, CurScope, Loc, BO_Mul, LastIteration32.get(), SemaRef .PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(), - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true) .get()); if (LastIteration64.isUsable()) @@ -9798,7 +9804,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, CurScope, Loc, BO_Mul, LastIteration64.get(), SemaRef .PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(), - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true) 
.get()); } @@ -11538,7 +11544,7 @@ bool OpenMPAtomicUpdateChecker::checkStatement(Stmt *S, unsigned DiagId, if (Update.isInvalid()) return true; Update = SemaRef.PerformImplicitConversion(Update.get(), X->getType(), - Sema::AA_Casting); + AssignmentAction::Casting); if (Update.isInvalid()) return true; UpdateExpr = Update.get(); @@ -15655,7 +15661,7 @@ static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc, break; } Res = S.PerformImplicitConversion(Res.get(), AllocatorHandleEnumTy, - Sema::AA_Initializing, + AssignmentAction::Initializing, /*AllowExplicit=*/true); if (!Res.isUsable()) { ErrorFound = true; @@ -15686,7 +15692,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocatorClause(Expr *A, return nullptr; Allocator = SemaRef.PerformImplicitConversion( Allocator.get(), DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, + AssignmentAction::Initializing, /*AllowExplicit=*/true); if (Allocator.isInvalid()) return nullptr; @@ -23096,7 +23102,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( return nullptr; AllocatorRes = SemaRef.PerformImplicitConversion( AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, + AssignmentAction::Initializing, /*AllowExplicit=*/true); if (AllocatorRes.isInvalid()) return nullptr; @@ -23939,14 +23945,14 @@ ExprResult SemaOpenMP::ActOnOMPIteratorExpr(Scope *S, Expr *Begin = D.Range.Begin; if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) { - ExprResult BeginRes = - SemaRef.PerformImplicitConversion(Begin, DeclTy, Sema::AA_Converting); + ExprResult BeginRes = SemaRef.PerformImplicitConversion( + Begin, DeclTy, AssignmentAction::Converting); Begin = BeginRes.get(); } Expr *End = D.Range.End; if (!IsDeclTyDependent && End && !End->isTypeDependent()) { - ExprResult EndRes = - SemaRef.PerformImplicitConversion(End, DeclTy, Sema::AA_Converting); + ExprResult EndRes = SemaRef.PerformImplicitConversion( + End, DeclTy, AssignmentAction::Converting); End = EndRes.get(); } Expr *Step = 
D.Range.Step; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1ce0fa091938d7..a3c13e21c709cb 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1811,9 +1811,9 @@ ExprResult Sema::PerformImplicitConversion(Expr *From, QualType ToType, return ExprError(); // Objective-C ARC: Determine whether we will allow the writeback conversion. - bool AllowObjCWritebackConversion - = getLangOpts().ObjCAutoRefCount && - (Action == AA_Passing || Action == AA_Sending); + bool AllowObjCWritebackConversion = + getLangOpts().ObjCAutoRefCount && (Action == AssignmentAction::Passing || + Action == AssignmentAction::Sending); if (getLangOpts().ObjC) ObjC().CheckObjCBridgeRelatedConversions(From->getBeginLoc(), ToType, From->getType(), From); @@ -5983,7 +5983,8 @@ ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) { ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From); if (!ICS.isBad()) - return PerformImplicitConversion(From, Context.BoolTy, ICS, AA_Converting); + return PerformImplicitConversion(From, Context.BoolTy, ICS, + AssignmentAction::Converting); if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy)) return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition) @@ -6149,7 +6150,8 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From, T, cast(Dest)), SourceLocation(), From); } else { - Result = S.PerformImplicitConversion(From, T, ICS, Sema::AA_Converting); + Result = + S.PerformImplicitConversion(From, T, ICS, AssignmentAction::Converting); } if (Result.isInvalid()) return Result; @@ -6370,7 +6372,8 @@ ExprResult Sema::PerformContextuallyConvertToObjCPointer(Expr *From) { ImplicitConversionSequence ICS = TryContextuallyConvertToObjCPointer(*this, From); if (!ICS.isBad()) - return PerformImplicitConversion(From, Ty, ICS, AA_Converting); + return PerformImplicitConversion(From, Ty, ICS, + AssignmentAction::Converting); return ExprResult(); 
} @@ -14363,7 +14366,8 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, // break out so that we will build the appropriate built-in // operator node. ExprResult InputRes = PerformImplicitConversion( - Input, Best->BuiltinParamTypes[0], Best->Conversions[0], AA_Passing, + Input, Best->BuiltinParamTypes[0], Best->Conversions[0], + AssignmentAction::Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (InputRes.isInvalid()) return ExprError(); @@ -14825,14 +14829,16 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // operator node. ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); @@ -15203,14 +15209,16 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, // operator node. 
ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); diff --git a/clang/lib/Sema/SemaPseudoObject.cpp b/clang/lib/Sema/SemaPseudoObject.cpp index fdb584ceb81059..30ed47e6e56ec9 100644 --- a/clang/lib/Sema/SemaPseudoObject.cpp +++ b/clang/lib/Sema/SemaPseudoObject.cpp @@ -787,7 +787,7 @@ ExprResult ObjCPropertyOpBuilder::buildSet(Expr *op, SourceLocation opcLoc, if (opResult.isInvalid() || S.DiagnoseAssignmentResult(assignResult, opcLoc, paramType, op->getType(), opResult.get(), - Sema::AA_Assigning)) + AssignmentAction::Assigning)) return ExprError(); op = opResult.get(); diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index ba681671eb3290..9664287b9a3fe9 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3151,7 +3151,8 @@ Sema::ActOnIndirectGotoStmt(SourceLocation GotoLoc, SourceLocation StarLoc, if (ExprRes.isInvalid()) return StmtError(); E = ExprRes.get(); - if (DiagnoseAssignmentResult(ConvTy, StarLoc, DestTy, ETy, E, AA_Passing)) + if (DiagnoseAssignmentResult(ConvTy, StarLoc, DestTy, ETy, E, + AssignmentAction::Passing)) return StmtError(); } From a0441ced7a770036e00610989e2fabba5caeb31b Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 29 Aug 2024 16:01:57 -0300 Subject: [PATCH 40/72] [NFC] whitespace cleanup on clang/test/SemaTemplate/temp_arg_nontype.cpp --- clang/test/SemaTemplate/temp_arg_nontype.cpp | 20 
++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/clang/test/SemaTemplate/temp_arg_nontype.cpp b/clang/test/SemaTemplate/temp_arg_nontype.cpp index da42f85fb910c8..f360aa14950edd 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype.cpp @@ -128,28 +128,28 @@ namespace ns { struct Foo { static const bool value = true; }; - + template struct Bar {}; - + const bool value = false; - + Bar::value)> x; } // PR5349 namespace ns { enum E { k }; - + template struct Baz {}; - + Baz f1; // This works. Baz f2; // This too. Baz(0)> f3; // And this. - + Baz b1; // This doesn't work. - Baz(0)> b2; // This neither. + Baz(0)> b2; // This neither. } // PR5597 @@ -193,7 +193,7 @@ namespace EntityReferenced { template struct Y { - static void f(T x) { + static void f(T x) { x = 1; // expected-error{{incompatible integer to pointer conversion assigning to 'int *' from 'int'}} } }; @@ -208,7 +208,7 @@ namespace PR6964 { // expected-note {{template parameter is declared here}} struct as_nview { }; - template + template struct as_nview // expected-note{{while checking a default template argument used here}} { }; } @@ -235,7 +235,7 @@ namespace test8 { char y; double z; }; - + template struct B { C* p; B() : p(cp) {} From a87105121dd300752c19024ebaf93319c2781a8b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 29 Aug 2024 14:18:37 -0500 Subject: [PATCH 41/72] [libc] Implement locale variants for 'stdlib.h' functions (#105718) Summary: This provides the `_l` variants for the `stdlib.h` functions. These are just copies of the same entrypoint and don't do anything with the locale information. 
--- libc/config/gpu/entrypoints.txt | 7 ++ libc/config/linux/x86_64/entrypoints.txt | 9 +++ libc/include/llvm-libc-macros/stdlib-macros.h | 5 ++ libc/include/stdlib.h.def | 1 + libc/newhdrgen/yaml/stdlib.yaml | 60 +++++++++++++++ libc/spec/stdc.td | 8 ++ libc/src/stdlib/CMakeLists.txt | 77 +++++++++++++++++++ libc/src/stdlib/strtod_l.cpp | 30 ++++++++ libc/src/stdlib/strtod_l.h | 22 ++++++ libc/src/stdlib/strtof_l.cpp | 30 ++++++++ libc/src/stdlib/strtof_l.h | 22 ++++++ libc/src/stdlib/strtol_l.cpp | 30 ++++++++ libc/src/stdlib/strtol_l.h | 22 ++++++ libc/src/stdlib/strtold_l.cpp | 30 ++++++++ libc/src/stdlib/strtold_l.h | 22 ++++++ libc/src/stdlib/strtoll_l.cpp | 30 ++++++++ libc/src/stdlib/strtoll_l.h | 22 ++++++ libc/src/stdlib/strtoul_l.cpp | 30 ++++++++ libc/src/stdlib/strtoul_l.h | 22 ++++++ libc/src/stdlib/strtoull_l.cpp | 30 ++++++++ libc/src/stdlib/strtoull_l.h | 23 ++++++ 21 files changed, 532 insertions(+) create mode 100644 libc/src/stdlib/strtod_l.cpp create mode 100644 libc/src/stdlib/strtod_l.h create mode 100644 libc/src/stdlib/strtof_l.cpp create mode 100644 libc/src/stdlib/strtof_l.h create mode 100644 libc/src/stdlib/strtol_l.cpp create mode 100644 libc/src/stdlib/strtol_l.h create mode 100644 libc/src/stdlib/strtold_l.cpp create mode 100644 libc/src/stdlib/strtold_l.h create mode 100644 libc/src/stdlib/strtoll_l.cpp create mode 100644 libc/src/stdlib/strtoll_l.h create mode 100644 libc/src/stdlib/strtoul_l.cpp create mode 100644 libc/src/stdlib/strtoul_l.h create mode 100644 libc/src/stdlib/strtoull_l.cpp create mode 100644 libc/src/stdlib/strtoull_l.h diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index db7cd24dadb7fc..d8f78f0d174534 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -173,12 +173,19 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.rand libc.src.stdlib.srand libc.src.stdlib.strtod + libc.src.stdlib.strtod_l libc.src.stdlib.strtof + libc.src.stdlib.strtof_l 
libc.src.stdlib.strtol + libc.src.stdlib.strtol_l libc.src.stdlib.strtold + libc.src.stdlib.strtold_l libc.src.stdlib.strtoll + libc.src.stdlib.strtoll_l libc.src.stdlib.strtoul + libc.src.stdlib.strtoul_l libc.src.stdlib.strtoull + libc.src.stdlib.strtoull_l libc.src.stdlib.at_quick_exit libc.src.stdlib.quick_exit libc.src.stdlib.getenv diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 141dc70463d64a..0aa38c7afc76f4 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -800,6 +800,15 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.ctype.tolower_l libc.src.ctype.toupper_l + # stdlib.h entrypoints + libc.src.stdlib.strtod_l + libc.src.stdlib.strtof_l + libc.src.stdlib.strtol_l + libc.src.stdlib.strtold_l + libc.src.stdlib.strtoll_l + libc.src.stdlib.strtoul_l + libc.src.stdlib.strtoull_l + # assert.h entrypoints libc.src.assert.__assert_fail diff --git a/libc/include/llvm-libc-macros/stdlib-macros.h b/libc/include/llvm-libc-macros/stdlib-macros.h index 5fcbfef97b3285..2565c76be3c55c 100644 --- a/libc/include/llvm-libc-macros/stdlib-macros.h +++ b/libc/include/llvm-libc-macros/stdlib-macros.h @@ -17,6 +17,11 @@ #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 +#ifndef MB_CUR_MAX +// We only support the "C" locale right now, so this is a constant byte. 
+#define MB_CUR_MAX 1 +#endif // MB_CUR_MAX + #define RAND_MAX 2147483647 #endif // LLVM_LIBC_MACROS_STDLIB_MACROS_H diff --git a/libc/include/stdlib.h.def b/libc/include/stdlib.h.def index d523f7a53024aa..01b0e1a2395a29 100644 --- a/libc/include/stdlib.h.def +++ b/libc/include/stdlib.h.def @@ -10,6 +10,7 @@ #define LLVM_LIBC_STDLIB_H #include "__llvm-libc-common.h" +#include "llvm-libc-types/locale_t.h" #include "llvm-libc-macros/stdlib-macros.h" %%public_api() diff --git a/libc/newhdrgen/yaml/stdlib.yaml b/libc/newhdrgen/yaml/stdlib.yaml index 081da5391c3a52..5da49b8a89101c 100644 --- a/libc/newhdrgen/yaml/stdlib.yaml +++ b/libc/newhdrgen/yaml/stdlib.yaml @@ -273,3 +273,63 @@ functions: - type: const char *__restrict - type: char **__restrict - type: int + - name: strtod_l + standards: + - stdc + return_type: double + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: locale_t + - name: strtof_l + standards: + - stdc + return_type: float + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: locale_t + - name: strtol_l + standards: + - stdc + return_type: long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t + - name: strtold_l + standards: + - stdc + return_type: long double + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: locale_t + - name: strtoll_l + standards: + - stdc + return_type: long long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t + - name: strtoul_l + standards: + - stdc + return_type: unsigned long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t + - name: strtoull_l + standards: + - stdc + return_type: unsigned long long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td 
index 026cc72b458a77..2c61cb9d952951 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -1308,6 +1308,14 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtof", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtod", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtold", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtol", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoll", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"malloc", RetValSpec, [ArgSpec]>, FunctionSpec<"calloc", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"realloc", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index ce12e66cf3e57f..7fc68cb35e8489 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -428,6 +428,83 @@ if(NOT LLVM_LIBC_FULL_BUILD) return() endif() +add_entrypoint_object( + strtof_l + SRCS + strtof_l.cpp + HDRS + strtof_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_float +) + +add_entrypoint_object( + strtod_l + SRCS + strtod_l.cpp + HDRS + strtod_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_float +) + +add_entrypoint_object( + strtold_l + SRCS + strtold_l.cpp + HDRS + strtold_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_float +) + +add_entrypoint_object( + strtol_l + SRCS + strtol_l.cpp + HDRS + strtol_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + +add_entrypoint_object( + strtoll_l + SRCS + strtoll_l.cpp + HDRS + strtoll_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + +add_entrypoint_object( + 
strtoul_l + SRCS + strtoul_l.cpp + HDRS + strtoul_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + +add_entrypoint_object( + strtoull_l + SRCS + strtoull_l.cpp + HDRS + strtoull_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) endif() diff --git a/libc/src/stdlib/strtod_l.cpp b/libc/src/stdlib/strtod_l.cpp new file mode 100644 index 00000000000000..247314398315b0 --- /dev/null +++ b/libc/src/stdlib/strtod_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtod_l ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtod_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, strtod_l, + (const char *__restrict str, char **__restrict str_end, + locale_t)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtod_l.h b/libc/src/stdlib/strtod_l.h new file mode 100644 index 00000000000000..06a8c893af2896 --- /dev/null +++ b/libc/src/stdlib/strtod_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtod_l ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOD_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOD_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double strtod_l(const char *__restrict str, char **__restrict str_end, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOD_L_H diff --git a/libc/src/stdlib/strtof_l.cpp b/libc/src/stdlib/strtof_l.cpp new file mode 100644 index 00000000000000..d54efa66e0846c --- /dev/null +++ b/libc/src/stdlib/strtof_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtof_l ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtof_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float, strtof_l, + (const char *__restrict str, char **__restrict str_end, + locale_t)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtof_l.h b/libc/src/stdlib/strtof_l.h new file mode 100644 index 00000000000000..de629e3f36d458 --- /dev/null +++ b/libc/src/stdlib/strtof_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtof_l ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOF_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOF_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +float strtof_l(const char *__restrict str, char **__restrict str_end, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOF_L_H diff --git a/libc/src/stdlib/strtol_l.cpp b/libc/src/stdlib/strtol_l.cpp new file mode 100644 index 00000000000000..f94aff1a0d7b2a --- /dev/null +++ b/libc/src/stdlib/strtol_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtol_l ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtol_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long, strtol_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtol_l.h b/libc/src/stdlib/strtol_l.h new file mode 100644 index 00000000000000..9f8c8553654d78 --- /dev/null +++ b/libc/src/stdlib/strtol_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtol_l ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long strtol_l(const char *__restrict str, char **__restrict str_end, int base, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOL_L_H diff --git a/libc/src/stdlib/strtold_l.cpp b/libc/src/stdlib/strtold_l.cpp new file mode 100644 index 00000000000000..d0c57f50246b5c --- /dev/null +++ b/libc/src/stdlib/strtold_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtold_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtold_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long double, strtold_l, + (const char *__restrict str, char **__restrict str_end, + locale_t)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtold_l.h b/libc/src/stdlib/strtold_l.h new file mode 100644 index 00000000000000..d694ce279b6e39 --- /dev/null +++ b/libc/src/stdlib/strtold_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtold_l ---------------------*- C++ -*-===// +// +// Part 
of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOLD_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOLD_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long double strtold_l(const char *__restrict str, char **__restrict str_end, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOLD_L_H diff --git a/libc/src/stdlib/strtoll_l.cpp b/libc/src/stdlib/strtoll_l.cpp new file mode 100644 index 00000000000000..e82971d59c48d3 --- /dev/null +++ b/libc/src/stdlib/strtoll_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtoll_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtoll_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long long, strtoll_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtoll_l.h b/libc/src/stdlib/strtoll_l.h new file mode 100644 index 00000000000000..461fedb3df485d --- /dev/null +++ b/libc/src/stdlib/strtoll_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtoll_l ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOLL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOLL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long long strtoll_l(const char *__restrict str, char **__restrict str_end, + int base, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOLL_L_H diff --git a/libc/src/stdlib/strtoul_l.cpp b/libc/src/stdlib/strtoul_l.cpp new file mode 100644 index 00000000000000..74fce00a0ac3c4 --- /dev/null +++ b/libc/src/stdlib/strtoul_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtoul_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtoul_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(unsigned long, strtoul_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtoul_l.h b/libc/src/stdlib/strtoul_l.h new file mode 100644 index 00000000000000..7c9f53a8acb31c --- /dev/null +++ b/libc/src/stdlib/strtoul_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtoul_l ---------------------*- C++ 
-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOUL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOUL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +unsigned long strtoul_l(const char *__restrict str, char **__restrict str_end, + int base, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOUL_L_H diff --git a/libc/src/stdlib/strtoull_l.cpp b/libc/src/stdlib/strtoull_l.cpp new file mode 100644 index 00000000000000..2ea8a43a40ef2a --- /dev/null +++ b/libc/src/stdlib/strtoull_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtoull_l --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtoull_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(unsigned long long, strtoull_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtoull_l.h b/libc/src/stdlib/strtoull_l.h new file mode 100644 index 00000000000000..c40f83ed1ffff2 --- /dev/null +++ b/libc/src/stdlib/strtoull_l.h @@ -0,0 +1,23 @@ +//===-- Implementation header for strtoull_l --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOULL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOULL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +unsigned long long strtoull_l(const char *__restrict str, + char **__restrict str_end, int base, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOULL_L_H From 5c019bdb7a008cf6465972d4affd8b2802465722 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 29 Aug 2024 14:20:15 -0500 Subject: [PATCH 42/72] [libc] Add support for 'string.h' locale variants (#105719) Summary: This adds the locale variants of the string functions. As previously, these do not use the locale information at all and simply copy the non-locale version which expects the "C" locale. --- libc/config/gpu/entrypoints.txt | 2 ++ libc/config/linux/x86_64/entrypoints.txt | 4 ++++ libc/include/string.h.def | 1 + libc/newhdrgen/yaml/string.yaml | 17 ++++++++++++++ libc/spec/stdc.td | 13 +++++++++++ libc/src/string/CMakeLists.txt | 19 ++++++++++++++++ libc/src/string/strcoll_l.cpp | 24 ++++++++++++++++++++ libc/src/string/strcoll_l.h | 21 ++++++++++++++++++ libc/src/string/strxfrm_l.cpp | 28 ++++++++++++++++++++++++ libc/src/string/strxfrm_l.h | 23 +++++++++++++++++++ 10 files changed, 152 insertions(+) create mode 100644 libc/src/string/strcoll_l.cpp create mode 100644 libc/src/string/strcoll_l.h create mode 100644 libc/src/string/strxfrm_l.cpp create mode 100644 libc/src/string/strxfrm_l.h diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index d8f78f0d174534..706f603b6ff56f 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -58,6 +58,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strchrnul libc.src.string.strcmp 
libc.src.string.strcoll + libc.src.string.strcoll_l libc.src.string.strcpy libc.src.string.strcspn libc.src.string.strdup @@ -79,6 +80,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strtok libc.src.string.strtok_r libc.src.string.strxfrm + libc.src.string.strxfrm_l # stdbit.h entrypoints libc.src.stdbit.stdc_bit_ceil_uc diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 0aa38c7afc76f4..3fd88fc0020e55 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -809,6 +809,10 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.strtoul_l libc.src.stdlib.strtoull_l + # string.h entrypoints + libc.src.string.strcoll_l + libc.src.string.strxfrm_l + # assert.h entrypoints libc.src.assert.__assert_fail diff --git a/libc/include/string.h.def b/libc/include/string.h.def index 1bd2687db2beac..e180f0d2561d3a 100644 --- a/libc/include/string.h.def +++ b/libc/include/string.h.def @@ -11,6 +11,7 @@ #include "__llvm-libc-common.h" +#include "llvm-libc-types/locale_t.h" #include "llvm-libc-macros/null-macro.h" %%public_api() diff --git a/libc/newhdrgen/yaml/string.yaml b/libc/newhdrgen/yaml/string.yaml index 1d6e64bfb9cf60..af1750e91243ea 100644 --- a/libc/newhdrgen/yaml/string.yaml +++ b/libc/newhdrgen/yaml/string.yaml @@ -144,6 +144,14 @@ functions: arguments: - type: const char * - type: const char * + - name: strcoll_l + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: const char * + - type: locale_t - name: strcpy standards: - stdc @@ -300,3 +308,12 @@ functions: - type: char *__restrict - type: const char *__restrict - type: size_t + - name: strxfrm_l + standards: + - stdc + return_type: size_t + arguments: + - type: char *__restrict + - type: const char *__restrict + - type: size_t + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 2c61cb9d952951..1742e1f7b0ef33 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -354,6 
+354,11 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec, ArgSpec] >, + FunctionSpec< + "strcoll_l", + RetValSpec, + [ArgSpec, ArgSpec, ArgSpec] + >, FunctionSpec< "strncmp", RetValSpec, @@ -366,6 +371,14 @@ def StdC : StandardSpec<"stdc"> { ArgSpec, ArgSpec] >, + FunctionSpec< + "strxfrm_l", + RetValSpec, + [ArgSpec, + ArgSpec, + ArgSpec, + ArgSpec] + >, FunctionSpec< "strchr", RetValSpec, diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 56588ffafb86f0..787188ab3beb91 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -200,6 +200,14 @@ add_entrypoint_object( strcoll.h ) +add_entrypoint_object( + strcoll_l + SRCS + strcoll_l.cpp + HDRS + strcoll_l.h +) + add_entrypoint_object( strcpy SRCS @@ -441,6 +449,17 @@ add_entrypoint_object( .memory_utils.inline_memcpy ) +add_entrypoint_object( + strxfrm_l + SRCS + strxfrm_l.cpp + HDRS + strxfrm_l.h + DEPENDS + .string_utils + .memory_utils.inline_memcpy +) + add_entrypoint_object( memset_explicit SRCS diff --git a/libc/src/string/strcoll_l.cpp b/libc/src/string/strcoll_l.cpp new file mode 100644 index 00000000000000..f664a3c7c03f37 --- /dev/null +++ b/libc/src/string/strcoll_l.cpp @@ -0,0 +1,24 @@ +//===-- Implementation of strcoll_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/strcoll_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Add support for locales. 
+LLVM_LIBC_FUNCTION(int, strcoll_l, + (const char *left, const char *right, locale_t)) { + for (; *left && *left == *right; ++left, ++right) + ; + return static_cast(*left) - static_cast(*right); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/strcoll_l.h b/libc/src/string/strcoll_l.h new file mode 100644 index 00000000000000..97230fb811236c --- /dev/null +++ b/libc/src/string/strcoll_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for strcoll_l ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_STRCOLL_L_H +#define LLVM_LIBC_SRC_STRING_STRCOLL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int strcoll_l(const char *left, const char *right, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_STRCOLL_L_H diff --git a/libc/src/string/strxfrm_l.cpp b/libc/src/string/strxfrm_l.cpp new file mode 100644 index 00000000000000..ae758e1fcba6d8 --- /dev/null +++ b/libc/src/string/strxfrm_l.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of strxfrm_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/strxfrm_l.h" +#include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" +#include "src/string/string_utils.h" + +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Add support for locales. +LLVM_LIBC_FUNCTION(size_t, strxfrm_l, + (char *__restrict dest, const char *__restrict src, size_t n, + locale_t)) { + size_t len = internal::string_length(src); + if (n > len) + inline_memcpy(dest, src, len + 1); + return len; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/strxfrm_l.h b/libc/src/string/strxfrm_l.h new file mode 100644 index 00000000000000..af0f181601184b --- /dev/null +++ b/libc/src/string/strxfrm_l.h @@ -0,0 +1,23 @@ +//===-- Implementation header for strxfrm_l ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_STRXFRM_L_H +#define LLVM_LIBC_SRC_STRING_STRXFRM_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" +#include // For size_t + +namespace LIBC_NAMESPACE_DECL { + +size_t strxfrm_l(char *__restrict dest, const char *__restrict src, size_t n, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_STRXFRM_L_H From ba5e8fcecea20da0a796b85e20d6292eb1447b6c Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Thu, 29 Aug 2024 15:21:06 -0400 Subject: [PATCH 43/72] [flang] Adjust execute_command_line intrinsic return values for AIX (NFC) (#106472) --- flang/unittests/Runtime/CommandTest.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp index 20bd7a5b5ff35a..b0c43ba01d8f33 100644 --- a/flang/unittests/Runtime/CommandTest.cpp +++ b/flang/unittests/Runtime/CommandTest.cpp @@ -348,10 +348,14 @@ TEST_F(ZeroArguments, ECLGeneralErrorCommandErrorSync) { RTNAME(ExecuteCommandLine) (*command.get(), wait, exitStat.get(), cmdStat.get(), cmdMsg.get()); -#ifdef _WIN32 +#if defined(_WIN32) CheckDescriptorEqInt(exitStat.get(), 1); CheckDescriptorEqInt(cmdStat.get(), 6); CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); +#elif defined(_AIX) + CheckDescriptorEqInt(exitStat.get(), 2); + CheckDescriptorEqInt(cmdStat.get(), 6); + CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); #else CheckDescriptorEqInt(exitStat.get(), 1); CheckDescriptorEqInt(cmdStat.get(), 3); From 74938ab84dbfdedc6af7a276ebd67201b5eb78e5 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Thu, 29 Aug 2024 15:21:25 -0400 Subject: [PATCH 44/72] [AMDGPU][True16][MC] add true16/fake16 flag to gfx12 dasm tests (#106469) add true16/fake16 
flag to gfx12 dasm tests including vop1, vop1_dpp, vop3_from_vop1 and vop3_from_vop1_dpp. This is a test only change. --- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 356 +++++++++++++----- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 104 ++++- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 328 ++++++++++------ .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 297 ++++++++++----- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 87 +++-- 5 files changed, 817 insertions(+), 355 deletions(-) diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index ac45962e1743e4..b4aff84eeb69a4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -43,48 +45,70 @@ # GFX12: v_bfrev_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x0d,0x30 -# GFX12: v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# 
GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] 
0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_ceil_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_ceil_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_ceil_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_ceil_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_ceil_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_ceil_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_ceil_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -1231,48 +1255,70 @@ # GFX12: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] 0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30 -# GFX12: v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_exp_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_exp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_exp_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_exp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_exp_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb0,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_exp_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb0,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_exp_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x4a,0x0a,0x7e,0x01,0x1b,0x00,0xff 
@@ -1315,48 +1361,70 @@ # GFX12: v_exp_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] 
0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_floor_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_floor_f16_dpp v127, -|v127| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_floor_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_floor_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_floor_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] +0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_floor_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] +0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_floor_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -1651,48 +1719,70 @@ # GFX12: v_frexp_mant_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x80,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x80,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_log_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_log_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_log_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_log_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_log_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_log_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_log_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xae,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_log_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# 
COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xae,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_log_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -2029,48 +2119,70 @@ # GFX12: v_not_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x6e,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x6e,0xfe,0x7f,0xff,0x6f,0x0d,0x30 -# GFX12: v_rcp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_rcp_f16_dpp 
v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_rcp_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_rcp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_rcp_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_rcp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_rcp_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xa8,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_rcp_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xa8,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_rcp_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -2239,48 +2351,70 @@ # GFX12: v_rndne_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x46,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x46,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_rsq_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_rsq_f16_dpp v5, 
v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_rsq_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: 
v_rsq_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_rsq_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_rsq_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_rsq_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_rsq_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_rsq_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xac,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_rsq_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 ; encoding: [0xfa,0xac,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xac,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_rsq_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -2449,48 +2583,70 @@ # GFX12: v_sin_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x6a,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x6a,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_sqrt_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_sqrt_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_sqrt_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_sqrt_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_sqrt_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_sqrt_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xaa,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_sqrt_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xaa,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_sqrt_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 957c425008c872..04650eaec1180d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck 
-check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -7,12 +9,22 @@ # GFX12: v_bfrev_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_ceil_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_ceil_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# 
GFX12-REAL16: v_ceil_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_ceil_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -187,24 +199,44 @@ # GFX12: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00] 0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00 -# GFX12: v_exp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_exp_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xb0,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_exp_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xb0,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_exp_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_exp_f32_dpp v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_floor_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_floor_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_floor_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_floor_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -247,12 +279,22 @@ # GFX12: v_frexp_mant_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_log_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_log_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xae,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_log_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xae,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_log_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -301,12 +343,22 @@ # GFX12: v_not_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x6e,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x6e,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_rcp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_rcp_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xa8,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_rcp_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xa8,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xa8,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_rcp_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -331,12 +383,22 @@ # GFX12: v_rndne_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x46,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x46,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_rsq_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_rsq_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xac,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_rsq_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xac,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xac,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_rsq_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -361,12 +423,22 @@ # GFX12: v_sin_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x6a,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x6a,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_sqrt_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xaa,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_sqrt_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xaa,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_sqrt_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 5c7cc3a8e223e6..c0aab0692bbb5b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00 @@ -45,49 +46,64 @@ # GFX12: v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_ceil_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 
0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_ceil_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x01,0x00,0x00] @@ -513,7 +529,7 @@ # GFX12: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_f16_i16_e64 v5, 0x3800 mul:2 +# GFX12: v_cvt_f16_i16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] 0x05,0x00,0xd1,0xd5,0xf0,0x00,0x00,0x08 # GFX12: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10] @@ -558,7 +574,7 @@ # GFX12: v_cvt_f16_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_f16_u16_e64 v5, 0x3800 mul:2 +# GFX12: v_cvt_f16_u16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] 0x05,0x00,0xd0,0xd5,0xf0,0x00,0x00,0x08 # GFX12: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] @@ -1260,7 +1276,7 @@ # GFX12: v_cvt_i32_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64 v5, 0x3800 +# GFX12: v_cvt_i32_i16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xea,0xd5,0xf0,0x00,0x00,0x00 # GFX12: v_cvt_i32_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xea,0xd5,0xfd,0x00,0x00,0x00] @@ -1611,7 +1627,7 @@ # GFX12: v_cvt_u32_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_u32_u16_e64 v5, 0x3800 +# GFX12: v_cvt_u32_u16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xeb,0xd5,0xf0,0x00,0x00,0x00 # GFX12: v_cvt_u32_u16_e64 v5, src_scc ; encoding: [0x05,0x00,0xeb,0xd5,0xfd,0x00,0x00,0x00] @@ -1620,49 +1636,64 @@ # GFX12: v_cvt_u32_u16_e64 v255, 0xfe0b ; encoding: 
[0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] 
0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, 0.5 mul:2 ; 
encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_exp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_exp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_exp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x01,0x00,0x00] @@ -1710,49 +1741,64 @@ # GFX12: v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] 0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf -# GFX12: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] 
0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: 
v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_floor_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_floor_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: 
v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_floor_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x01,0x00,0x00] @@ -2214,49 +2260,64 @@ # GFX12: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf -# GFX12: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] 
0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, 
null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_log_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_log_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_log_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x01,0x00,0x00] @@ -2448,7 +2509,7 @@ # GFX12: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, 0x3800 +# 
GFX12: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00 # GFX12: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] @@ -2502,49 +2563,64 @@ # GFX12: v_not_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] 
0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, -1 ; 
encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_rcp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x01,0x00,0x00] @@ -2799,49 +2875,64 @@ # GFX12: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf -# GFX12: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] 
0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: 
v_rsq_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_rsq_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x01,0x00,0x00] @@ -3060,49 +3151,64 @@ # GFX12: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] 0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf -# GFX12: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_sqrt_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 
0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_sqrt_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index f9c768e3e02665..95cfad146fc163 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff @@ -42,46 +43,60 @@ # GFX12: v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_ceil_f16_e64_dpp 
v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1260,46 +1275,60 @@ # GFX12: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: 
v_exp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_exp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_exp_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1344,46 +1373,60 @@ # GFX12: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1680,46 +1723,60 @@ # GFX12: v_frexp_mant_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xc0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xc0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_log_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: 
v_log_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_log_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_log_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_log_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_log_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_log_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1932,46 +1989,60 @@ # GFX12: v_not_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_rcp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: 
v_rcp_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_rcp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_rcp_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xaa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -2142,46 +2213,60 @@ # GFX12: v_rndne_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xa3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_rsq_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: 
v_rsq_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_rsq_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_rsq_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xae,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -2352,46 +2437,60 @@ # GFX12: v_sin_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xb5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 
mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_sqrt_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index eccd691855774e..a9474c7c4fe7f0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble 
-show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 @@ -6,16 +7,20 @@ # GFX12: v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_e64_dpp v255, -|v255| clamp 
div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -306,16 +311,20 @@ # GFX12: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_e64_dpp 
v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_exp_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -330,16 +339,20 @@ # GFX12: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -414,16 +427,20 @@ # GFX12: v_frexp_mant_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xc0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xc0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_log_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_log_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_log_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -462,16 +479,20 @@ # GFX12: v_not_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x00,0xb7,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xb7,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_rcp_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xaa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -522,16 +543,20 @@ # GFX12: v_rndne_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xa3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_e64_dpp v255, 
-|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_rsq_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xae,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -576,16 +601,20 @@ # GFX12: v_sin_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xb5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# 
GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_sqrt_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 59762a0ecf64cbf6ac20c41ae75666cd87519f26 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 12:08:33 -0700 Subject: [PATCH 45/72] [RISCV] Add coverage for <3 x float> reduction with neutral start We can do slightly better on the neutral value when we have nsz. 
--- .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 237 +++++++++++------- 1 file changed, 141 insertions(+), 96 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 566c9070eab512..5d5807cbadbad5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -489,6 +489,51 @@ define float @vreduce_ord_fadd_v7f32(ptr %x, float %s) { ret float %red } +define float @vreduce_fadd_v7f32_neutralstart(ptr %x) { +; CHECK-LABEL: vreduce_fadd_v7f32_neutralstart: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call reassoc float @llvm.vector.reduce.fadd.v7f32(float -0.0, <7 x float> %v) + ret float %red +} + +define float @vreduce_fadd_v7f32_neutralstart_nsz(ptr %x) { +; CHECK-LABEL: vreduce_fadd_v7f32_neutralstart_nsz: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredosum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call nsz float @llvm.vector.reduce.fadd.v7f32(float -0.0, <7 x float> %v) + ret float %red +} + +define float @vreduce_fadd_v7f32_neutralstart_fast(ptr %x) { +; CHECK-LABEL: vreduce_fadd_v7f32_neutralstart_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call fast float @llvm.vector.reduce.fadd.v7f32(float -0.0, <7 x 
float> %v) + ret float %red +} + declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) @@ -1683,12 +1728,12 @@ define float @vreduce_fminimum_v2f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB104_2 +; CHECK-NEXT: beqz a0, .LBB107_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB104_2: +; CHECK-NEXT: .LBB107_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1719,12 +1764,12 @@ define float @vreduce_fminimum_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB106_2 +; CHECK-NEXT: beqz a0, .LBB109_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB106_2: +; CHECK-NEXT: .LBB109_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1758,12 +1803,12 @@ define float @vreduce_fminimum_v7f32(ptr %x) { ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10, v0.t -; CHECK-NEXT: beqz a0, .LBB108_2 +; CHECK-NEXT: beqz a0, .LBB111_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: .LBB111_2: ; CHECK-NEXT: lui a0, 522240 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vfredmin.vs v8, v8, v10 @@ -1798,12 +1843,12 @@ define float @vreduce_fminimum_v8f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB110_2 +; CHECK-NEXT: beqz a0, .LBB113_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB110_2: +; CHECK-NEXT: .LBB113_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, 
v8 ; CHECK-NEXT: ret @@ -1834,12 +1879,12 @@ define float @vreduce_fminimum_v16f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB112_2 +; CHECK-NEXT: beqz a0, .LBB115_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB112_2: +; CHECK-NEXT: .LBB115_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1871,12 +1916,12 @@ define float @vreduce_fminimum_v32f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB114_2 +; CHECK-NEXT: beqz a0, .LBB117_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB114_2: +; CHECK-NEXT: .LBB117_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1926,15 +1971,15 @@ define float @vreduce_fminimum_v64f32(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB116_2 +; CHECK-NEXT: beqz a0, .LBB119_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB116_3 -; CHECK-NEXT: .LBB116_2: +; CHECK-NEXT: j .LBB119_3 +; CHECK-NEXT: .LBB119_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB116_3: +; CHECK-NEXT: .LBB119_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2048,15 +2093,15 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB118_2 +; CHECK-NEXT: beqz a0, .LBB121_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB118_3 -; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: j 
.LBB121_3 +; CHECK-NEXT: .LBB121_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB118_3: +; CHECK-NEXT: .LBB121_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2102,12 +2147,12 @@ define double @vreduce_fminimum_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB120_2 +; CHECK-NEXT: beqz a0, .LBB123_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI120_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI120_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI123_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI123_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB120_2: +; CHECK-NEXT: .LBB123_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2138,12 +2183,12 @@ define double @vreduce_fminimum_v4f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB122_2 +; CHECK-NEXT: beqz a0, .LBB125_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI122_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI122_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI125_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI125_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB122_2: +; CHECK-NEXT: .LBB125_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2174,12 +2219,12 @@ define double @vreduce_fminimum_v8f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB124_2 +; CHECK-NEXT: beqz a0, .LBB127_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI124_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI124_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI127_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI127_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB124_2: +; CHECK-NEXT: .LBB127_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2210,12 +2255,12 
@@ define double @vreduce_fminimum_v16f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB126_2 +; CHECK-NEXT: beqz a0, .LBB129_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI126_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI126_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI129_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI129_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB126_2: +; CHECK-NEXT: .LBB129_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2263,15 +2308,15 @@ define double @vreduce_fminimum_v32f64(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB128_2 +; CHECK-NEXT: beqz a0, .LBB131_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI128_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI128_0)(a0) -; CHECK-NEXT: j .LBB128_3 -; CHECK-NEXT: .LBB128_2: +; CHECK-NEXT: lui a0, %hi(.LCPI131_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI131_0)(a0) +; CHECK-NEXT: j .LBB131_3 +; CHECK-NEXT: .LBB131_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB128_3: +; CHECK-NEXT: .LBB131_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2383,15 +2428,15 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB130_2 +; CHECK-NEXT: beqz a0, .LBB133_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI130_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI130_0)(a0) -; CHECK-NEXT: j .LBB130_3 -; CHECK-NEXT: .LBB130_2: +; CHECK-NEXT: lui a0, %hi(.LCPI133_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI133_0)(a0) +; CHECK-NEXT: j .LBB133_3 +; CHECK-NEXT: .LBB133_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB130_3: +; CHECK-NEXT: .LBB133_3: ; CHECK-NEXT: csrr a0, vlenb ; 
CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2436,12 +2481,12 @@ define float @vreduce_fmaximum_v2f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB132_2 +; CHECK-NEXT: beqz a0, .LBB135_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB132_2: +; CHECK-NEXT: .LBB135_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2472,12 +2517,12 @@ define float @vreduce_fmaximum_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB134_2 +; CHECK-NEXT: beqz a0, .LBB137_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB134_2: +; CHECK-NEXT: .LBB137_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2511,12 +2556,12 @@ define float @vreduce_fmaximum_v7f32(ptr %x) { ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10, v0.t -; CHECK-NEXT: beqz a0, .LBB136_2 +; CHECK-NEXT: beqz a0, .LBB139_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB136_2: +; CHECK-NEXT: .LBB139_2: ; CHECK-NEXT: lui a0, 1046528 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vfredmax.vs v8, v8, v10 @@ -2551,12 +2596,12 @@ define float @vreduce_fmaximum_v8f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB138_2 +; CHECK-NEXT: beqz a0, .LBB141_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB138_2: +; CHECK-NEXT: .LBB141_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2587,12 +2632,12 @@ 
define float @vreduce_fmaximum_v16f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB140_2 +; CHECK-NEXT: beqz a0, .LBB143_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB140_2: +; CHECK-NEXT: .LBB143_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2624,12 +2669,12 @@ define float @vreduce_fmaximum_v32f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB142_2 +; CHECK-NEXT: beqz a0, .LBB145_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB142_2: +; CHECK-NEXT: .LBB145_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2679,15 +2724,15 @@ define float @vreduce_fmaximum_v64f32(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB144_2 +; CHECK-NEXT: beqz a0, .LBB147_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB144_3 -; CHECK-NEXT: .LBB144_2: +; CHECK-NEXT: j .LBB147_3 +; CHECK-NEXT: .LBB147_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB144_3: +; CHECK-NEXT: .LBB147_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2801,15 +2846,15 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB146_2 +; CHECK-NEXT: beqz a0, .LBB149_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB146_3 -; CHECK-NEXT: .LBB146_2: +; CHECK-NEXT: j .LBB149_3 +; CHECK-NEXT: .LBB149_2: ; 
CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB146_3: +; CHECK-NEXT: .LBB149_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2855,12 +2900,12 @@ define double @vreduce_fmaximum_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB148_2 +; CHECK-NEXT: beqz a0, .LBB151_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI148_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI148_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI151_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI151_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB148_2: +; CHECK-NEXT: .LBB151_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2891,12 +2936,12 @@ define double @vreduce_fmaximum_v4f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB150_2 +; CHECK-NEXT: beqz a0, .LBB153_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI150_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI150_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI153_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI153_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB150_2: +; CHECK-NEXT: .LBB153_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2927,12 +2972,12 @@ define double @vreduce_fmaximum_v8f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB152_2 +; CHECK-NEXT: beqz a0, .LBB155_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI152_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI152_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI155_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI155_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB152_2: +; CHECK-NEXT: .LBB155_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2963,12 +3008,12 @@ define double 
@vreduce_fmaximum_v16f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB154_2 +; CHECK-NEXT: beqz a0, .LBB157_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI154_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI154_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI157_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI157_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB154_2: +; CHECK-NEXT: .LBB157_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -3016,15 +3061,15 @@ define double @vreduce_fmaximum_v32f64(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB156_2 +; CHECK-NEXT: beqz a0, .LBB159_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI156_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI156_0)(a0) -; CHECK-NEXT: j .LBB156_3 -; CHECK-NEXT: .LBB156_2: +; CHECK-NEXT: lui a0, %hi(.LCPI159_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI159_0)(a0) +; CHECK-NEXT: j .LBB159_3 +; CHECK-NEXT: .LBB159_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB156_3: +; CHECK-NEXT: .LBB159_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -3136,15 +3181,15 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB158_2 +; CHECK-NEXT: beqz a0, .LBB161_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI158_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI158_0)(a0) -; CHECK-NEXT: j .LBB158_3 -; CHECK-NEXT: .LBB158_2: +; CHECK-NEXT: lui a0, %hi(.LCPI161_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI161_0)(a0) +; CHECK-NEXT: j .LBB161_3 +; CHECK-NEXT: .LBB161_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB158_3: +; CHECK-NEXT: .LBB161_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 
3 ; CHECK-NEXT: mv a1, a0 From d5c292d8ef590f64d26c16d12afebb6ad7f50373 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 12:35:50 -0700 Subject: [PATCH 46/72] [GISel][RISCV] Correctly handle scalable vector shuffles of pointer vectors in IRTranslator. (#106580) --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 11 ++++----- llvm/lib/CodeGen/MachineVerifier.cpp | 4 ++-- .../GlobalISel/irtranslator/shufflevector.ll | 23 +++++++++++++++++-- .../MachineVerifier/test_g_splat_vector.mir | 4 ++-- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 968d0a2a5c75e4..b290d7fb4ce4a1 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3183,15 +3183,14 @@ bool IRTranslator::translateExtractElement(const User &U, bool IRTranslator::translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder) { - // A ShuffleVector that has operates on scalable vectors is a splat vector - // where the value of the splat vector is the 0th element of the first - // operand, since the index mask operand is the zeroinitializer (undef and + // A ShuffleVector that operates on scalable vectors is a splat vector where + // the value of the splat vector is the 0th element of the first operand, + // since the index mask operand is the zeroinitializer (undef and // poison are treated as zeroinitializer here). 
if (U.getOperand(0)->getType()->isScalableTy()) { - Value *Op0 = U.getOperand(0); + Register Val = getOrCreateVReg(*U.getOperand(0)); auto SplatVal = MIRBuilder.buildExtractVectorElementConstant( - LLT::scalar(Op0->getType()->getScalarSizeInBits()), - getOrCreateVReg(*Op0), 0); + MRI->getType(Val).getElementType(), Val, 0); MIRBuilder.buildSplatVector(getOrCreateVReg(U), SplatVal); return true; } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 5e9bb4c27ffbdf..759201ed9dadc7 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1835,8 +1835,8 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } - if (!SrcTy.isScalar()) { - report("Source type must be a scalar", MI); + if (!SrcTy.isScalar() && !SrcTy.isPointer()) { + report("Source type must be a scalar or pointer", MI); break; } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll index 7ea67073bc28d2..89c7bfe81d5f98 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll @@ -1770,5 +1770,24 @@ define @shufflevector_nxv16i64_2( %a) { ret %b } - - +define @shufflevector_nxv1p0_0() { + ; RV32-LABEL: name: shufflevector_nxv1p0_0 + ; RV32: bb.1 (%ir-block.0): + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[EVEC:%[0-9]+]]:_(p0) = G_EXTRACT_VECTOR_ELT [[DEF]](), [[C]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[EVEC]](p0) + ; RV32-NEXT: $v8 = COPY [[SPLAT_VECTOR]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: shufflevector_nxv1p0_0 + ; RV64: bb.1 (%ir-block.0): + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV64-NEXT: 
[[EVEC:%[0-9]+]]:_(p0) = G_EXTRACT_VECTOR_ELT [[DEF]](), [[C]](s64) + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[EVEC]](p0) + ; RV64-NEXT: $v8 = COPY [[SPLAT_VECTOR]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %a = shufflevector poison, poison, zeroinitializer + ret %a +} diff --git a/llvm/test/MachineVerifier/test_g_splat_vector.mir b/llvm/test/MachineVerifier/test_g_splat_vector.mir index 00074349776fa7..a5bde496a3f22c 100644 --- a/llvm/test/MachineVerifier/test_g_splat_vector.mir +++ b/llvm/test/MachineVerifier/test_g_splat_vector.mir @@ -16,10 +16,10 @@ body: | ; CHECK: Destination type must be a scalable vector %4:_(<2 x s32>) = G_SPLAT_VECTOR %0 - ; CHECK: Source type must be a scalar + ; CHECK: Source type must be a scalar or pointer %5:_() = G_SPLAT_VECTOR %1 - ; CHECK: Source type must be a scalar + ; CHECK: Source type must be a scalar or pointer %6:_() = G_SPLAT_VECTOR %2 ; CHECK: Element type of the destination must be the same size or smaller than the source type From aeedab77b596f858b0c53923657fc8c190d48ea8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 29 Aug 2024 12:32:25 -0700 Subject: [PATCH 47/72] [SLP]Correctly decide if the non-power-of-2 number of stores can be vectorized. Need to consider the maximum type size in the graph before doing attempt for the vectorization of non-power-of-2 number of elements, which may be less than MinVF. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 5 +- .../Transforms/SLPVectorizer/X86/odd_store.ll | 62 ++++++++----------- 2 files changed, 28 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e77db3cbd81fe5..775fa9ba75cfb7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16476,8 +16476,9 @@ bool SLPVectorizerPass::vectorizeStores( // First try vectorizing with a non-power-of-2 VF. 
At the moment, only // consider cases where VF + 1 is a power-of-2, i.e. almost all vector // lanes are used. - unsigned CandVF = Operands.size(); - if (has_single_bit(CandVF + 1) && CandVF <= MaxRegVF) + unsigned CandVF = + std::clamp(Operands.size(), MaxVF, MaxRegVF); + if (has_single_bit(CandVF + 1)) NonPowerOf2VF = CandVF; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll index 5f2c42d5c2dec8..f1989712657203 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll @@ -9,43 +9,31 @@ ;} define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) { -; NON-POW2-LABEL: @foo( -; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> zeroinitializer -; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP2]], [[TMP4]] -; NON-POW2-NEXT: [[TMP6:%.*]] = fpext <3 x float> [[TMP5]] to <3 x double> -; NON-POW2-NEXT: [[TMP7:%.*]] = fadd <3 x double> [[TMP6]], -; NON-POW2-NEXT: [[TMP8:%.*]] = fptosi <3 x double> [[TMP7]] to <3 x i8> -; NON-POW2-NEXT: store <3 x i8> [[TMP8]], ptr [[A:%.*]], align 1 -; NON-POW2-NEXT: ret i32 undef -; -; POW2-ONLY-LABEL: @foo( -; POW2-ONLY-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]] -; POW2-ONLY-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double -; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00 -; POW2-ONLY-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8 -; POW2-ONLY-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1 -; 
POW2-ONLY-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11 -; POW2-ONLY-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 -; POW2-ONLY-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]] -; POW2-ONLY-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double -; POW2-ONLY-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00 -; POW2-ONLY-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8 -; POW2-ONLY-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 -; POW2-ONLY-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 -; POW2-ONLY-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12 -; POW2-ONLY-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 -; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]] -; POW2-ONLY-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double -; POW2-ONLY-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00 -; POW2-ONLY-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8 -; POW2-ONLY-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 -; POW2-ONLY-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1 -; POW2-ONLY-NEXT: ret i32 undef +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double +; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8 +; CHECK-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]] +; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double +; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00 +; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] 
to i8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 +; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]] +; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double +; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00 +; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 +; CHECK-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1 +; CHECK-NEXT: ret i32 undef ; %1 = getelementptr inbounds float, ptr %B, i64 10 %2 = load float, ptr %1, align 4 From f08f9cd9713332c939889ab34f5355b77f12f82b Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 29 Aug 2024 12:56:15 -0700 Subject: [PATCH 48/72] [HWASan] remove incorrectly inferred attributes (#106565) assume all functions used in a HWASan module potentially touch shadow memory (and short granules). 
--- .../Instrumentation/HWAddressSanitizer.cpp | 26 +- .../HWAddressSanitizer/RISCV/alloca.ll | 156 +++++----- .../HWAddressSanitizer/RISCV/basic.ll | 270 +++++++++--------- .../HWAddressSanitizer/alloca.ll | 160 ++++++----- .../HWAddressSanitizer/attrinfer.ll | 14 + .../HWAddressSanitizer/basic.ll | 208 +++++++------- .../HWAddressSanitizer/fixed-shadow.ll | 4 +- .../hwasan-pass-second-run.ll | 4 +- .../HWAddressSanitizer/mem-attr.ll | 2 +- 9 files changed, 437 insertions(+), 407 deletions(-) create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 95433a216b168d..f5faf117f69bdd 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,6 +598,24 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); + for (auto &F : M.functions()) { + // Remove memory attributes that are invalid with HWASan. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. + // + // This is not only true for sanitized functions, because AttrInfer can + // infer those attributes on libc functions, which is not true if those + // are instrumented (Android) or intercepted. + + // nobuiltin makes sure later passes don't restore assumptions about + // the function. 
+ F.addFnAttr(llvm::Attribute::NoBuiltin); + F.removeFnAttr(llvm::Attribute::Memory); + for (auto &A : F.args()) + A.removeAttr(llvm::Attribute::WriteOnly); + } + // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1622,14 +1640,6 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); - // Remove memory attributes that are about to become invalid. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. - F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 23b1043c700165..032168e28421b9 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. 
define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr 
[[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = 
ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: 
[[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] 
= ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: 
store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -131,15 +131,17 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. -; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
-; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. ; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -147,15 +149,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 
5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -163,13 +166,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: 
[[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index 9cebe2e845f772..dc2d11cb4b3538 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,8 +9,6 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW -; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] -; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -32,7 +30,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label 
[[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -68,7 +66,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -88,7 +86,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -108,10 +106,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -120,13 +118,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -145,7 +143,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -165,10 +163,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -177,13 +175,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -212,7 +210,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -248,7 +246,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -268,7 
+266,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -288,10 +286,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -300,13 +298,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -325,7 +323,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -345,10 +343,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -357,13 +355,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -392,7 +390,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr 
i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -428,7 +426,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -448,7 +446,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -468,10 +466,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 
[[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -480,13 +478,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-DYNAMIC-SHADOW: 22: @@ -505,7 +503,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -525,10 +523,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -537,13 +535,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -572,7 +570,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -608,7 +606,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], 
label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -628,7 +626,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -648,10 +646,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -660,13 
+658,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -685,7 +683,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -705,10 +703,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to 
ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -717,13 +715,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 
21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -752,7 +750,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -788,7 +786,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -808,7 +806,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; 
ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -828,10 +826,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -840,13 +838,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label 
[[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -865,7 +863,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -885,10 +883,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br 
label [[TMP21:%.*]] @@ -897,13 +895,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1013,7 +1011,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1049,7 +1047,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: 
[[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1069,7 +1067,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1089,10 +1087,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], 
label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1101,13 +1099,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1126,7 +1124,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1146,10 +1144,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1158,13 +1156,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1193,7 +1191,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1229,7 +1227,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1249,7 +1247,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1269,10 +1267,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1281,13 +1279,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 
16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1306,7 +1304,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1326,10 +1324,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1338,13 +1336,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1373,7 +1371,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: 
br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1409,7 +1407,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1429,7 +1427,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1449,10 +1447,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label 
[[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1461,13 +1459,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1486,7 +1484,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1506,10 +1504,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1518,13 +1516,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1553,7 +1551,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1589,7 +1587,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1609,7 +1607,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1629,10 +1627,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1641,13 +1639,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1666,7 +1664,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1686,10 +1684,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1698,13 +1696,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1733,7 +1731,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1769,7 +1767,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1789,7 +1787,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr 
[[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1809,10 +1807,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1821,13 +1819,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1846,7 +1844,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1866,10 +1864,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm 
sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1878,13 +1876,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -2060,43 +2058,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) { +; FASTPATH-SAME: (ptr [[A:%.*]]) 
#[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -2108,43 +2106,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; NOFASTPATH-NEXT: 
entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 4bd23ea76c159b..0f74736dc232ea 100644 --- 
a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg 
[[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 
[[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr 
@llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; 
ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, 
!dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -112,13 +112,13 @@ entry: define void @test_vscale_alloca() sanitize_hwaddress { ; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR1]] { ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; DYNAMIC-SHADOW-NEXT: ret void ; ; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1]] { ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; ZERO-BASED-SHADOW-NEXT: ret void @@ -150,15 +150,17 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. 
-; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. -; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -166,15 +168,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: 
[[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -182,13 +185,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = 
!DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll new file mode 100644 index 00000000000000..eeb51aeda1000b --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll @@ -0,0 +1,14 @@ +; Standard library functions get inferred attributes, some of which are not +; correct when building for HWASan. 
+ +; RUN: opt < %s -passes=hwasan -S | FileCheck %s --check-prefixes=CHECK + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android10000" + +declare float @frexpf(float noundef, ptr nocapture noundef) local_unnamed_addr #0 + +attributes #0 = { mustprogress nofree nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "hwasan-abi"="interceptor" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fix-cortex-a53-835769,+fp-armv8,+neon,+outline-atomics,+tagged-globals,+v8a" } + +; CHECK-NOT: memory(argmem: write) +; CHECK: nobuiltin diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 4212293f42545e..1e74f2891a2e3c 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label 
[[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] 
= trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 
[[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, 
ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], 
label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 
; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; 
RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], 
[[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; 
RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1542,43 +1542,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 
[[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -1590,43 +1590,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; 
CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) 
#[[ATTR2:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll index 980189c5607f31..f72fc0a9720e4a 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll @@ -194,7 +194,7 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] @@ -206,7 +206,7 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll index 00614b603fe799..2635dfb75ed98f 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll @@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: @__hwasan_shadow = external global [0 x i8] ;. define i8 @test_load8(ptr %a) sanitize_hwaddress { -; CHECK: Function Attrs: sanitize_hwaddress +; CHECK: Function Attrs: nobuiltin sanitize_hwaddress ; CHECK-LABEL: define i8 @test_load8 ; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -33,7 +33,7 @@ entry: ret i8 %b } ;. 
-; CHECK: attributes #[[ATTR0]] = { sanitize_hwaddress } +; CHECK: attributes #[[ATTR0]] = { nobuiltin sanitize_hwaddress } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0]] = !{ptr @hwasan.note} diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll index c0e370f20213aa..919eacb2951f5e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll @@ -11,5 +11,5 @@ entry: ret void } -; CHECK: attributes #0 = { sanitize_hwaddress uwtable } +; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable } attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable } From 0141a3cde4d8f2c8ff9e957f981f37e65a69a325 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Thu, 29 Aug 2024 21:59:03 +0200 Subject: [PATCH 49/72] [analyzer] Fix nullptr dereference for symbols from pointer invalidation (#106568) As reported in https://github.com/llvm/llvm-project/pull/105648#issuecomment-2317144635 commit 08ad8dc7154bf3ab79f750e6d5fb7df597c7601a introduced a nullptr dereference in the case when store contains a binding to a symbol that has no origin region associated with it, such as the symbol generated when a pointer is passed to an opaque function. 
--- .../Checkers/StackAddrEscapeChecker.cpp | 5 ++++- clang/test/Analysis/stack-addr-ps.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp index 20232405d572d2..ec577c36188e6c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp @@ -308,7 +308,10 @@ static const MemSpaceRegion *getStackOrGlobalSpaceRegion(const MemRegion *R) { const MemRegion *getOriginBaseRegion(const MemRegion *Reg) { Reg = Reg->getBaseRegion(); while (const auto *SymReg = dyn_cast(Reg)) { - Reg = SymReg->getSymbol()->getOriginRegion()->getBaseRegion(); + const auto *OriginReg = SymReg->getSymbol()->getOriginRegion(); + if (!OriginReg) + break; + Reg = OriginReg->getBaseRegion(); } return Reg; } diff --git a/clang/test/Analysis/stack-addr-ps.c b/clang/test/Analysis/stack-addr-ps.c index 138b8c16b02bde..7d7294455f1dbe 100644 --- a/clang/test/Analysis/stack-addr-ps.c +++ b/clang/test/Analysis/stack-addr-ps.c @@ -126,3 +126,21 @@ void caller_for_nested_leaking() { int *ptr = 0; caller_mid_for_nested_leaking(&ptr); } + +// This used to crash StackAddrEscapeChecker because +// it features a symbol conj_$1{struct c *, LC1, S763, #1} +// that has no origin region. 
+struct a { + int member; +}; + +struct c { + struct a *nested_ptr; +}; +void opaque(struct c*); +struct c* get_c(void); +void no_crash_for_symbol_without_origin_region(void) { + struct c *ptr = get_c(); + opaque(ptr); + ptr->nested_ptr->member++; +} // No crash at the end of the function From 66927fb95abef9327b453d7213c5df7d641269be Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 29 Aug 2024 13:06:21 -0700 Subject: [PATCH 50/72] Revert "[HWASan] remove incorrectly inferred attributes" (#106622) Reverts llvm/llvm-project#106565 Broke clang tests --- .../Instrumentation/HWAddressSanitizer.cpp | 26 +- .../HWAddressSanitizer/RISCV/alloca.ll | 156 +++++----- .../HWAddressSanitizer/RISCV/basic.ll | 270 +++++++++--------- .../HWAddressSanitizer/alloca.ll | 160 +++++------ .../HWAddressSanitizer/attrinfer.ll | 14 - .../HWAddressSanitizer/basic.ll | 208 +++++++------- .../HWAddressSanitizer/fixed-shadow.ll | 4 +- .../hwasan-pass-second-run.ll | 4 +- .../HWAddressSanitizer/mem-attr.ll | 2 +- 9 files changed, 407 insertions(+), 437 deletions(-) delete mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index f5faf117f69bdd..95433a216b168d 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,24 +598,6 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); - for (auto &F : M.functions()) { - // Remove memory attributes that are invalid with HWASan. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. 
- // - // This is not only true for sanitized functions, because AttrInfer can - // infer those attributes on libc functions, which is not true if those - // are instrumented (Android) or intercepted. - - // nobuiltin makes sure later passes don't restore assumptions about - // the function. - F.addFnAttr(llvm::Attribute::NoBuiltin); - F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - } - // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1640,6 +1622,14 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); + // Remove memory attributes that are about to become invalid. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. + F.removeFnAttr(llvm::Attribute::Memory); + for (auto &A : F.args()) + A.removeAttr(llvm::Attribute::WriteOnly); + BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 032168e28421b9..23b1043c700165 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. 
define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr 
[[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = 
ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: 
[[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] 
= ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: 
store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; entry: %x = alloca i32, align 4 @@ -131,17 +131,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. -; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
-; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. ; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -149,16 +147,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; DYNAMIC-SHADOW: [[META10]] = !{null} -; 
DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; DYNAMIC-SHADOW: [[META9]] = !{null} +; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -166,14 +163,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META10]] = !{null} -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: 
[[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META9]] = !{null} +; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index dc2d11cb4b3538..9cebe2e845f772 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,6 +9,8 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW +; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -30,7 +32,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label 
[[TMP13:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -66,7 +68,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -86,7 +88,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -106,10 +108,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -118,13 +120,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -143,7 +145,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -163,10 +165,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -175,13 +177,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -210,7 +212,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -246,7 +248,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -266,7 
+268,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -286,10 +288,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -298,13 +300,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -323,7 +325,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -343,10 +345,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -355,13 +357,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -390,7 +392,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr 
i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -426,7 +428,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -446,7 +448,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -466,10 +468,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 
[[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +480,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-DYNAMIC-SHADOW: 22: @@ -503,7 +505,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -523,10 +525,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -535,13 +537,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +572,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -606,7 +608,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], 
label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -626,7 +628,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -646,10 +648,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -658,13 
+660,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -683,7 +685,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -703,10 +705,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to 
ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -715,13 +717,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 
21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -750,7 +752,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -786,7 +788,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -806,7 +808,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; 
ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -826,10 +828,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -838,13 +840,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label 
[[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -863,7 +865,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -883,10 +885,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br 
label [[TMP21:%.*]] @@ -895,13 +897,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1011,7 +1013,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1047,7 +1049,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: 
[[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1067,7 +1069,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1087,10 +1089,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], 
label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1099,13 +1101,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1124,7 +1126,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1144,10 +1146,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1156,13 +1158,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1191,7 +1193,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1227,7 +1229,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1247,7 +1249,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1267,10 +1269,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1279,13 +1281,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 
16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1304,7 +1306,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1324,10 +1326,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1336,13 +1338,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1371,7 +1373,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: 
br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1407,7 +1409,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1427,7 +1429,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1447,10 +1449,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label 
[[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1459,13 +1461,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1484,7 +1486,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1504,10 +1506,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1516,13 +1518,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1551,7 +1553,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1587,7 +1589,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1607,7 +1609,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1627,10 +1629,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1639,13 +1641,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1664,7 +1666,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1684,10 +1686,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1696,13 +1698,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1731,7 +1733,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1767,7 +1769,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1787,7 +1789,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr 
[[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1807,10 +1809,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1819,13 +1821,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1844,7 +1846,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1864,10 +1866,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm 
sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1876,13 +1878,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -2058,43 +2060,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: 
(ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -2106,43 +2108,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: 
entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 0f74736dc232ea..4bd23ea76c159b 100644 --- 
a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg 
[[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 
[[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr 
@llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; 
ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, 
!dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; entry: %x = alloca i32, align 4 @@ -112,13 +112,13 @@ entry: define void @test_vscale_alloca() sanitize_hwaddress { ; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] { ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; DYNAMIC-SHADOW-NEXT: ret void ; ; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] { ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; ZERO-BASED-SHADOW-NEXT: ret void @@ -150,17 +150,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. 
-; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. -; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -168,16 +166,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; DYNAMIC-SHADOW: [[META10]] = !{null} -; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: 
[[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; DYNAMIC-SHADOW: [[META9]] = !{null} +; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -185,14 +182,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META10]] = !{null} -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: 
[[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META9]] = !{null} +; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll deleted file mode 100644 index eeb51aeda1000b..00000000000000 --- a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll +++ /dev/null @@ -1,14 +0,0 @@ -; Standard library functions get inferred attributes, some of which are not -; correct when building for HWASan. 
- -; RUN: opt < %s -passes=hwasan -S | FileCheck %s --check-prefixes=CHECK - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android10000" - -declare float @frexpf(float noundef, ptr nocapture noundef) local_unnamed_addr #0 - -attributes #0 = { mustprogress nofree nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "hwasan-abi"="interceptor" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fix-cortex-a53-835769,+fp-armv8,+neon,+outline-atomics,+tagged-globals,+v8a" } - -; CHECK-NOT: memory(argmem: write) -; CHECK: nobuiltin diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 1e74f2891a2e3c..4212293f42545e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label 
[[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] 
= trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 
[[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, 
ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], 
label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 
; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; 
RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], 
[[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; 
RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1542,43 +1542,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 
[[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -1590,43 +1590,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; 
CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) 
#[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll index f72fc0a9720e4a..980189c5607f31 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll @@ -194,7 +194,7 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] @@ -206,7 +206,7 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll index 2635dfb75ed98f..00614b603fe799 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll @@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: @__hwasan_shadow = external global [0 x i8] ;. define i8 @test_load8(ptr %a) sanitize_hwaddress { -; CHECK: Function Attrs: nobuiltin sanitize_hwaddress +; CHECK: Function Attrs: sanitize_hwaddress ; CHECK-LABEL: define i8 @test_load8 ; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -33,7 +33,7 @@ entry: ret i8 %b } ;. 
-; CHECK: attributes #[[ATTR0]] = { nobuiltin sanitize_hwaddress } +; CHECK: attributes #[[ATTR0]] = { sanitize_hwaddress } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0]] = !{ptr @hwasan.note} diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll index 919eacb2951f5e..c0e370f20213aa 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll @@ -11,5 +11,5 @@ entry: ret void } -; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable } +; CHECK: attributes #0 = { sanitize_hwaddress uwtable } attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable } From c4906588ce47de33d59bcd95f3e82ce2c3e61c23 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 29 Aug 2024 21:19:59 +0100 Subject: [PATCH 51/72] [VPlan] Use skipCostComputation when pre-computing induction costs. This ensures we skip any instructions identified to be ignored by the legacy cost model as well. Fixes a divergence between legacy and VPlan-based cost model. Fixes https://github.com/llvm/llvm-project/issues/106417. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- .../LoopVectorize/RISCV/induction-costs.ll | 192 ++++++++++++++++++ 2 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4cc75e2e754603..6babfd1eee9108 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7113,7 +7113,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, IVInsts.push_back(CI); } for (Instruction *IVInst : IVInsts) { - if (!CostCtx.SkipCostComputation.insert(IVInst).second) + if (CostCtx.skipCostComputation(IVInst, VF.isVector())) continue; InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); LLVM_DEBUG({ @@ -7121,6 +7121,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, << ": induction instruction " << *IVInst << "\n"; }); Cost += InductionCost; + CostCtx.SkipCostComputation.insert(IVInst); } } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll new file mode 100644 index 00000000000000..bee7bb7bd61622 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -0,0 +1,192 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Test case for https://github.com/llvm/llvm-project/issues/106417. 
+define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { +; CHECK-LABEL: define void @skip_free_iv_truncate( +; CHECK-SAME: i16 [[X:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[X_I32:%.*]] = sext i16 [[X]] to i32 +; CHECK-NEXT: [[X_I64:%.*]] = sext i16 [[X]] to i64 +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 -8 +; CHECK-NEXT: [[SMAX20:%.*]] = call i64 @llvm.smax.i64(i64 [[X_I64]], i64 99) +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SMAX20]], [[X_I64]] +; CHECK-NEXT: [[UMIN21:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[SMAX20]], [[UMIN21]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[X_I64]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UMIN21]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umax.i64(i64 288, i64 [[TMP7]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP8]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[X_I64]], i64 99) +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[SMAX]], [[X_I64]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP9]], i64 1) +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[SMAX]], [[UMIN]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[X_I64]] +; CHECK-NEXT: [[TMP12:%.*]] = udiv i64 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[UMIN]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nsw i64 [[X_I64]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 6, i64 [[TMP13]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } 
[[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult ptr [[TMP16]], [[SCEVGEP]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[X_I64]], 3 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]] +; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 24, i64 [[TMP13]]) +; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = sub i64 0, [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp ult ptr [[TMP21]], [[SCEVGEP1]] +; CHECK-NEXT: [[TMP23:%.*]] = or i1 [[TMP22]], [[MUL_OVERFLOW4]] +; CHECK-NEXT: [[TMP24:%.*]] = add nsw i64 [[TMP19]], -8 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP24]] +; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 24, i64 [[TMP13]]) +; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = sub i64 0, [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult ptr [[TMP26]], [[SCEVGEP5]] +; CHECK-NEXT: [[TMP28:%.*]] = or i1 [[TMP27]], [[MUL_OVERFLOW8]] +; CHECK-NEXT: [[TMP29:%.*]] = or i1 [[TMP18]], [[TMP23]] +; CHECK-NEXT: [[TMP30:%.*]] = or i1 [[TMP29]], [[TMP28]] +; CHECK-NEXT: br i1 [[TMP30]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP31:%.*]] = shl nsw i64 [[X_I64]], 1 +; CHECK-NEXT: 
[[SCEVGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[SMAX10:%.*]] = call i64 @llvm.smax.i64(i64 [[X_I64]], i64 99) +; CHECK-NEXT: [[TMP32:%.*]] = sub i64 [[SMAX10]], [[X_I64]] +; CHECK-NEXT: [[UMIN11:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP32]], i64 1) +; CHECK-NEXT: [[TMP33:%.*]] = sub i64 [[SMAX10]], [[UMIN11]] +; CHECK-NEXT: [[TMP34:%.*]] = sub i64 [[TMP33]], [[X_I64]] +; CHECK-NEXT: [[TMP35:%.*]] = udiv i64 [[TMP34]], 3 +; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[UMIN11]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 6 +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], [[TMP31]] +; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 2 +; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = shl nsw i64 [[X_I64]], 3 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP36]], 24 +; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], 8 +; CHECK-NEXT: [[SCEVGEP14:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = add nsw i64 [[TMP40]], -8 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP42]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP9]], [[SCEVGEP14]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP13]], [[SCEVGEP12]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND017:%.*]] = icmp ult ptr [[SCEVGEP9]], [[SCEVGEP16]] +; CHECK-NEXT: [[BOUND118:%.*]] = icmp ult ptr [[SCEVGEP15]], [[SCEVGEP12]] +; CHECK-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; 
CHECK-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP46]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3 +; CHECK-NEXT: [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[X_I64]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = call @llvm.stepvector.nxv8i64() +; CHECK-NEXT: [[TMP54:%.*]] = add [[TMP53]], zeroinitializer +; CHECK-NEXT: [[TMP55:%.*]] = mul [[TMP54]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP55]] +; CHECK-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 8 +; CHECK-NEXT: [[TMP58:%.*]] = mul i64 3, [[TMP57]] +; CHECK-NEXT: [[DOTSPLATINSERT24:%.*]] = insertelement poison, i64 [[TMP58]], i64 0 +; CHECK-NEXT: [[DOTSPLAT25:%.*]] = shufflevector [[DOTSPLATINSERT24]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i16, ptr [[A]], [[VEC_IND]] +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv8i16.nxv8p0( zeroinitializer, [[TMP59]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP52]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT25]] +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[ENTRY]] ], [ [[X_I64]], %[[VECTOR_SCEVCHECK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL23:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[ENTRY]] ], [ [[X_I32]], %[[VECTOR_SCEVCHECK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL23]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_I64:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP61:%.*]] = load i64, ptr [[GEP_I64]], align 8 +; CHECK-NEXT: [[TMP62:%.*]] = sext i32 [[IV_CONV]] to i64 +; CHECK-NEXT: [[GEP_CONV:%.*]] = getelementptr i64, ptr [[INVARIANT_GEP]], i64 [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = load i64, ptr [[GEP_CONV]], align 8 +; CHECK-NEXT: [[GEP_I16:%.*]] = getelementptr i16, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i16 0, ptr [[GEP_I16]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 +; CHECK-NEXT: [[TMP64]] = trunc i64 [[IV_NEXT]] to i32 +; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], 99 +; CHECK-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]] +; 
CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %x.i32 = sext i16 %x to i32 + %x.i64 = sext i16 %x to i64 + %invariant.gep = getelementptr i8, ptr %A, i64 -8 + br label %loop + +loop: + %iv = phi i64 [ %x.i64, %entry ], [ %iv.next, %loop ] + %iv.conv = phi i32 [ %x.i32, %entry ], [ %5, %loop ] + %gep.i64 = getelementptr i64, ptr %A, i64 %iv + %2 = load i64, ptr %gep.i64, align 8 + %3 = sext i32 %iv.conv to i64 + %gep.conv = getelementptr i64, ptr %invariant.gep, i64 %3 + %4 = load i64, ptr %gep.conv, align 8 + %gep.i16 = getelementptr i16, ptr %A, i64 %iv + store i16 0, ptr %gep.i16, align 2 + %iv.next = add i64 %iv, 3 + %5 = trunc i64 %iv.next to i32 + %c = icmp slt i64 %iv, 99 + br i1 %c, label %loop, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META5:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[META5]] = distinct !{[[META5]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]]} +;. 
From 1f0d545ec38ceaafa7ca94aa659be125bdcd721f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 29 Aug 2024 16:37:25 -0400 Subject: [PATCH 52/72] [libc++] Fix wraparound issue with -fsanitize=integer in string operator>> (#106263) Fixes #106261 rdar://133991190 --- libcxx/include/__type_traits/make_unsigned.h | 4 +--- libcxx/include/istream | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h index 282cd2d9113166..8757f451eb807b 100644 --- a/libcxx/include/__type_traits/make_unsigned.h +++ b/libcxx/include/__type_traits/make_unsigned.h @@ -86,12 +86,10 @@ template using make_unsigned_t = __make_unsigned_t<_Tp>; #endif -#ifndef _LIBCPP_CXX03_LANG template -_LIBCPP_HIDE_FROM_ABI constexpr __make_unsigned_t<_Tp> __to_unsigned_like(_Tp __x) noexcept { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __make_unsigned_t<_Tp> __to_unsigned_like(_Tp __x) _NOEXCEPT { return static_cast<__make_unsigned_t<_Tp> >(__x); } -#endif template using __copy_unsigned_t = __conditional_t::value, __make_unsigned_t<_Up>, _Up>; diff --git a/libcxx/include/istream b/libcxx/include/istream index d2b577a9ad9efc..7c65a24bc313d9 100644 --- a/libcxx/include/istream +++ b/libcxx/include/istream @@ -165,6 +165,7 @@ template #include <__type_traits/conjunction.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_base_of.h> +#include <__type_traits/make_unsigned.h> #include <__utility/declval.h> #include <__utility/forward.h> #include @@ -1211,12 +1212,17 @@ operator>>(basic_istream<_CharT, _Traits>& __is, basic_string<_CharT, _Traits, _ try { #endif __str.clear(); - streamsize __n = __is.width(); - if (__n <= 0) - __n = __str.max_size(); - if (__n <= 0) - __n = numeric_limits::max(); - streamsize __c = 0; + using _Size = typename basic_string<_CharT, _Traits, _Allocator>::size_type; + streamsize const __width = __is.width(); + _Size const __max_size = 
__str.max_size(); + _Size __n; + if (__width <= 0) { + __n = __max_size; + } else { + __n = std::__to_unsigned_like(__width) < __max_size ? static_cast<_Size>(__width) : __max_size; + } + + _Size __c = 0; const ctype<_CharT>& __ct = std::use_facet >(__is.getloc()); while (__c < __n) { typename _Traits::int_type __i = __is.rdbuf()->sgetc(); From 049b60c5bb7e774b74772c6b89c72593f73a89b0 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Thu, 29 Aug 2024 17:00:19 -0400 Subject: [PATCH 53/72] [NFC][Clang] Avoid potential null pointer dereferences in Sema::AddInitializerToDecl(). (#106235) Control flow analysis performed by a static analysis tool revealed the potential for null pointer dereferences to occur in conjunction with the `Init` parameter in `Sema::AddInitializerToDecl()`. On entry to the function, `Init` is required to be non-null as there are multiple potential branches that unconditionally dereference it. However, there were two places where `Init` is compared to null thus implying that `Init` is expected to be null in some cases. These checks appear to be purely defensive checks and thus unnecessary. Further, there were several cases where code checked `Result`, a variable of type `ExprResult`, for an invalid value, but did not check for a valid but null value and then proceeded to unconditionally dereference the potential null result. This change elides the unnecessary defensive checks and changes some checks for an invalid result to instead branch on an unusable result (either an invalid result or a valid but null result). 
--- clang/lib/Sema/SemaDecl.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 4efa80778e71b9..6327ae9b99aa4c 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -13319,8 +13319,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { } // WebAssembly tables can't be used to initialise a variable. - if (Init && !Init->getType().isNull() && - Init->getType()->isWebAssemblyTableType()) { + if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) { Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0; VDecl->setInvalidDecl(); return; @@ -13463,7 +13462,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { if (getLangOpts().DebuggerCastResultToId && DclT->isObjCObjectPointerType() && Init->getType() == Context.UnknownAnyTy) { ExprResult Result = forceUnknownAnyToType(Init, Context.getObjCIdType()); - if (Result.isInvalid()) { + if (!Result.isUsable()) { VDecl->setInvalidDecl(); return; } @@ -13491,7 +13490,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { InitializationSequence Init(*this, Entity, Kind, MultiExprArg(E)); return Init.Failed() ? ExprError() : E; }); - if (Res.isInvalid()) { + if (!Res.isUsable()) { VDecl->setInvalidDecl(); } else if (Res.get() != Args[Idx]) { Args[Idx] = Res.get(); @@ -13504,7 +13503,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { /*TopLevelOfInitList=*/false, /*TreatUnavailableAsInvalid=*/false); ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Args, &DclT); - if (Result.isInvalid()) { + if (!Result.isUsable()) { // If the provided initializer fails to initialize the var decl, // we attach a recovery expr for better recovery. 
auto RecoveryExpr = @@ -13528,8 +13527,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { InitSeq.step_begin()->Kind == InitializationSequence::SK_ParenthesizedListInit; QualType VDeclType = VDecl->getType(); - if (Init && !Init->getType().isNull() && - !Init->getType()->isDependentType() && !VDeclType->isDependentType() && + if (!Init->getType().isNull() && !Init->getType()->isDependentType() && + !VDeclType->isDependentType() && Context.getAsIncompleteArrayType(VDeclType) && Context.getAsIncompleteArrayType(Init->getType())) { // Bail out if it is not possible to deduce array size from the @@ -13592,7 +13591,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { ExprResult Result = ActOnFinishFullExpr(Init, VDecl->getLocation(), /*DiscardedValue*/ false, VDecl->isConstexpr()); - if (Result.isInvalid()) { + if (!Result.isUsable()) { VDecl->setInvalidDecl(); return; } From 593526f3fb138069fc93b14d08320d0e3f67c707 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 14:02:26 -0700 Subject: [PATCH 54/72] [X86] Use MCRegister instead of int64_t in X86MCExpr. NFC (#106569) --- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 6 +++--- llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 864b7d8e769ab1..2b6b0ad16bcf76 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2789,7 +2789,7 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { if (auto *RE = dyn_cast(Expr)) { // Segment Register. Reset Expr and copy value to register. Expr = nullptr; - Reg = RE->getRegNo(); + Reg = RE->getReg(); // Check the register. 
if (Reg == X86::EIZ || Reg == X86::RIZ) @@ -3052,7 +3052,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, return true; // Check the register. - BaseReg = cast(E)->getRegNo(); + BaseReg = cast(E)->getReg(); if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) return Error(BaseLoc, "eiz and riz can only be used as index registers", SMRange(BaseLoc, EndLoc)); @@ -3079,7 +3079,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, Warning(Loc, "scale factor without index register is ignored"); Scale = 1; } else { // IndexReg Found. - IndexReg = cast(E)->getRegNo(); + IndexReg = cast(E)->getReg(); if (BaseReg == X86::RIP) return Error(Loc, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h index c159d30194cc64..37e15193b04357 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h @@ -26,16 +26,16 @@ namespace llvm { class X86MCExpr : public MCTargetExpr { private: - const int64_t RegNo; // All + const MCRegister Reg; // All - explicit X86MCExpr(int64_t R) : RegNo(R) {} + explicit X86MCExpr(MCRegister R) : Reg(R) {} public: /// @name Construction /// @{ - static const X86MCExpr *create(int64_t RegNo, MCContext &Ctx) { - return new (Ctx) X86MCExpr(RegNo); + static const X86MCExpr *create(MCRegister Reg, MCContext &Ctx) { + return new (Ctx) X86MCExpr(Reg); } /// @} @@ -43,14 +43,14 @@ class X86MCExpr : public MCTargetExpr { /// @{ /// getSubExpr - Get the child of this expression. 
- int64_t getRegNo() const { return RegNo; } + MCRegister getReg() const { return Reg; } /// @} void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { if (!MAI || MAI->getAssemblerDialect() == 0) OS << '%'; - OS << X86ATTInstPrinter::getRegisterName(RegNo); + OS << X86ATTInstPrinter::getRegisterName(Reg); } bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, @@ -61,7 +61,7 @@ class X86MCExpr : public MCTargetExpr { bool inlineAssignedExpr() const override { return true; } bool isEqualTo(const MCExpr *X) const override { if (auto *E = dyn_cast(X)) - return getRegNo() == E->getRegNo(); + return getReg() == E->getReg(); return false; } void visitUsedExpr(MCStreamer &Streamer) const override {} From 4ca817d0511b2c36b2f5d242e0c8f90a7a9c4f14 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 14:02:53 -0700 Subject: [PATCH 55/72] [GlobalISel] Add bail outs for scalable vectors to some combines. (#106496) These combines call getNumElements() which isn't valid for scalable vectors. --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 9 +++++ .../GlobalISel/scalablevec-combiner-crash.ll | 35 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 1517ae707c8cff..df9c12bc9c97bd 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2678,6 +2678,9 @@ bool CombinerHelper::matchInsertExtractVecEltOutOfBounds(MachineInstr &MI) { MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) && "Expected an insert/extract element op"); LLT VecTy = MRI.getType(MI.getOperand(1).getReg()); + if (VecTy.isScalableVector()) + return false; + unsigned IdxIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 
2 : 3; auto Idx = getIConstantVRegVal(MI.getOperand(IdxIdx).getReg(), MRI); @@ -2961,6 +2964,10 @@ bool CombinerHelper::matchCombineInsertVecElts( Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); assert(DstTy.isVector() && "Invalid G_INSERT_VECTOR_ELT?"); + + if (DstTy.isScalableVector()) + return false; + unsigned NumElts = DstTy.getNumElements(); // If this MI is part of a sequence of insert_vec_elts, then // don't do the combine in the middle of the sequence. @@ -4046,6 +4053,8 @@ bool CombinerHelper::matchExtractVecEltBuildVec(MachineInstr &MI, // and find the source register that the index maps to. Register SrcVec = MI.getOperand(1).getReg(); LLT SrcTy = MRI.getType(SrcVec); + if (SrcTy.isScalableVector()) + return false; auto Cst = getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); if (!Cst || Cst->Value.getZExtValue() >= SrcTy.getNumElements()) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll b/llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll new file mode 100644 index 00000000000000..8ce4b334a5134c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -global-isel -stop-after=riscv-prelegalizer-combiner | FileCheck %s + +; Make sure we don't crash in the prelegalizer combiner for scalable vector +; insert and extracts. 
+ +define @insertelement_nxv1i1_0( %x) { + ; CHECK-LABEL: name: insertelement_nxv1i1_0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_() = G_INSERT_VECTOR_ELT [[COPY]], [[C]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]]() + ; CHECK-NEXT: PseudoRET implicit $v0 + %a = insertelement %x, i1 0, i32 0 + ret %a +} + +define @shufflevector_nxv1i1_0( %x) { + ; CHECK-LABEL: name: shufflevector_nxv1i1_0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s1) = G_EXTRACT_VECTOR_ELT [[COPY]](), [[C]](s64) + ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[EVEC]](s1) + ; CHECK-NEXT: $v0 = COPY [[SPLAT_VECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v0 + %a = shufflevector %x, poison, poison + ret %a +} From 182708680bbe34b579a09b2dbc3215b519b2473f Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Thu, 29 Aug 2024 14:25:34 -0700 Subject: [PATCH 56/72] [SandboxIR] Add ExtractValueInst. 
(#106613) --- llvm/include/llvm/SandboxIR/SandboxIR.h | 66 ++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 27 +++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 100 ++++++++++++++++++ 4 files changed, 194 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 14210ae35c0082..d4c2efca4ecfa0 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -74,6 +74,8 @@ // | // +- ShuffleVectorInst // | +// +- ExtractValueInst +// | // +- InsertValueInst // | // +- StoreInst @@ -120,6 +122,7 @@ class SelectInst; class ExtractElementInst; class InsertElementInst; class ShuffleVectorInst; +class ExtractValueInst; class InsertValueInst; class BranchInst; class UnaryInstruction; @@ -270,6 +273,7 @@ class Value { friend class ExtractElementInst; // For getting `Val`. friend class InsertElementInst; // For getting `Val`. friend class ShuffleVectorInst; // For getting `Val`. + friend class ExtractValueInst; // For getting `Val`. friend class InsertValueInst; // For getting `Val`. friend class BranchInst; // For getting `Val`. friend class LoadInst; // For getting `Val`. @@ -710,6 +714,7 @@ class Instruction : public sandboxir::User { friend class ExtractElementInst; // For getTopmostLLVMInstruction(). friend class InsertElementInst; // For getTopmostLLVMInstruction(). friend class ShuffleVectorInst; // For getTopmostLLVMInstruction(). + friend class ExtractValueInst; // For getTopmostLLVMInstruction(). friend class InsertValueInst; // For getTopmostLLVMInstruction(). friend class BranchInst; // For getTopmostLLVMInstruction(). friend class LoadInst; // For getTopmostLLVMInstruction(). @@ -1621,6 +1626,65 @@ class UnaryInstruction } }; +class ExtractValueInst : public UnaryInstruction { + /// Use Context::createExtractValueInst() instead. 
+ ExtractValueInst(llvm::ExtractValueInst *EVI, Context &Ctx) + : UnaryInstruction(ClassID::ExtractValue, Opcode::ExtractValue, EVI, + Ctx) {} + friend Context; // for ExtractValueInst() + +public: + static Value *create(Value *Agg, ArrayRef Idxs, BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx, + const Twine &Name = ""); + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ExtractValue; + } + + /// Returns the type of the element that would be extracted + /// with an extractvalue instruction with the specified parameters. + /// + /// Null is returned if the indices are invalid for the specified type. + static Type *getIndexedType(Type *Agg, ArrayRef Idxs) { + return llvm::ExtractValueInst::getIndexedType(Agg, Idxs); + } + + using idx_iterator = llvm::ExtractValueInst::idx_iterator; + + inline idx_iterator idx_begin() const { + return cast(Val)->idx_begin(); + } + inline idx_iterator idx_end() const { + return cast(Val)->idx_end(); + } + inline iterator_range indices() const { + return cast(Val)->indices(); + } + + Value *getAggregateOperand() { + return getOperand(getAggregateOperandIndex()); + } + const Value *getAggregateOperand() const { + return getOperand(getAggregateOperandIndex()); + } + static unsigned getAggregateOperandIndex() { + return llvm::ExtractValueInst::getAggregateOperandIndex(); + } + + ArrayRef getIndices() const { + return cast(Val)->getIndices(); + } + + unsigned getNumIndices() const { + return cast(Val)->getNumIndices(); + } + + unsigned hasIndices() const { + return cast(Val)->hasIndices(); + } +}; + class VAArgInst : public UnaryInstruction { VAArgInst(llvm::VAArgInst *FI, Context &Ctx) : UnaryInstruction(ClassID::VAArg, Opcode::VAArg, FI, Ctx) {} @@ -3123,6 +3187,8 @@ class Context { friend ExtractElementInst; // For createExtractElementInst() ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); friend ShuffleVectorInst; // For createShuffleVectorInst() + ExtractValueInst 
*createExtractValueInst(llvm::ExtractValueInst *IVI); + friend ExtractValueInst; // For createExtractValueInst() InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI); friend InsertValueInst; // For createInsertValueInst() BranchInst *createBranchInst(llvm::BranchInst *I); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 49b03ad6760d37..d29fc3b5e95871 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -47,6 +47,7 @@ DEF_INSTR(VAArg, OP(VAArg), VAArgInst) DEF_INSTR(Freeze, OP(Freeze), FreezeInst) DEF_INSTR(Fence, OP(Fence), FenceInst) DEF_INSTR(ShuffleVector, OP(ShuffleVector), ShuffleVectorInst) +DEF_INSTR(ExtractValue, OP(ExtractValue), ExtractValueInst) DEF_INSTR(InsertValue, OP(InsertValue), InsertValueInst) DEF_INSTR(Select, OP(Select), SelectInst) DEF_INSTR(Br, OP(Br), BranchInst) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 86198b7b83d0e2..b5786cdafd6307 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2145,6 +2145,21 @@ Constant *ShuffleVectorInst::convertShuffleMaskForBitcode( llvm::ShuffleVectorInst::convertShuffleMaskForBitcode(Mask, ResultTy)); } +Value *ExtractValueInst::create(Value *Agg, ArrayRef Idxs, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + llvm::Value *NewV = Builder.CreateExtractValue(Agg->Val, Idxs, Name); + if (auto *NewExtractValueInst = dyn_cast(NewV)) + return Ctx.createExtractValueInst(NewExtractValueInst); + assert(isa(NewV) && "Expected constant"); + return Ctx.getOrCreateConstant(cast(NewV)); +} + Value *InsertValueInst::create(Value *Agg, Value *Val, ArrayRef Idxs, BBIterator 
WhereIt, BasicBlock *WhereBB, Context &Ctx, const Twine &Name) { @@ -2320,6 +2335,12 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { new ShuffleVectorInst(LLVMIns, *this)); return It->second.get(); } + case llvm::Instruction::ExtractValue: { + auto *LLVMIns = cast(LLVMV); + It->second = + std::unique_ptr(new ExtractValueInst(LLVMIns, *this)); + return It->second.get(); + } case llvm::Instruction::InsertValue: { auto *LLVMIns = cast(LLVMV); It->second = @@ -2548,6 +2569,12 @@ Context::createShuffleVectorInst(llvm::ShuffleVectorInst *SVI) { return cast(registerValue(std::move(NewPtr))); } +ExtractValueInst *Context::createExtractValueInst(llvm::ExtractValueInst *EVI) { + auto NewPtr = + std::unique_ptr(new ExtractValueInst(EVI, *this)); + return cast(registerValue(std::move(NewPtr))); +} + InsertValueInst *Context::createInsertValueInst(llvm::InsertValueInst *IVI) { auto NewPtr = std::unique_ptr(new InsertValueInst(IVI, *this)); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 31519074e1b908..8bf4b24c48ee0d 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1261,6 +1261,106 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { } } +TEST_F(SandboxIRTest, ExtractValueInst) { + parseIR(C, R"IR( +define void @foo({i32, float} %agg) { + %ext_simple = extractvalue {i32, float} %agg, 0 + %ext_nested = extractvalue {float, {i32}} undef, 1, 0 + %const1 = extractvalue {i32, float} {i32 0, float 99.0}, 0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *ArgAgg = F.getArg(0); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *ExtSimple = cast(&*It++); + auto *ExtNested = cast(&*It++); + auto *Const1 = cast(&*It++); + auto *Ret = &*It++; + + EXPECT_EQ(ExtSimple->getOperand(0), ArgAgg); + + // create before instruction + auto 
*NewExtBeforeRet = + cast(sandboxir::ExtractValueInst::create( + ArgAgg, ArrayRef({0}), Ret->getIterator(), Ret->getParent(), + Ctx, "NewExtBeforeRet")); + EXPECT_EQ(NewExtBeforeRet->getNextNode(), Ret); +#ifndef NDEBUG + EXPECT_EQ(NewExtBeforeRet->getName(), "NewExtBeforeRet"); +#endif // NDEBUG + + // create at end of BB + auto *NewExtAtEnd = + cast(sandboxir::ExtractValueInst::create( + ArgAgg, ArrayRef({0}), BB->end(), BB, Ctx, "NewExtAtEnd")); + EXPECT_EQ(NewExtAtEnd->getPrevNode(), Ret); +#ifndef NDEBUG + EXPECT_EQ(NewExtAtEnd->getName(), "NewExtAtEnd"); +#endif // NDEBUG + + // Test the path that creates a folded constant. + auto *ShouldBeConstant = sandboxir::ExtractValueInst::create( + Const1->getOperand(0), ArrayRef({0}), BB->end(), BB, Ctx); + EXPECT_TRUE(isa(ShouldBeConstant)); + + auto *Zero = sandboxir::ConstantInt::get(Type::getInt32Ty(C), 0, Ctx); + EXPECT_EQ(ShouldBeConstant, Zero); + + // getIndexedType + Type *AggType = ExtNested->getAggregateOperand()->getType(); + EXPECT_EQ(sandboxir::ExtractValueInst::getIndexedType( + AggType, ArrayRef({1, 0})), + llvm::ExtractValueInst::getIndexedType(AggType, + ArrayRef({1, 0}))); + + EXPECT_EQ(sandboxir::ExtractValueInst::getIndexedType( + AggType, ArrayRef({2})), + nullptr); + + // idx_begin / idx_end + { + SmallVector IndicesSimple(ExtSimple->idx_begin(), + ExtSimple->idx_end()); + EXPECT_THAT(IndicesSimple, testing::ElementsAre(0u)); + + SmallVector IndicesNested(ExtNested->idx_begin(), + ExtNested->idx_end()); + EXPECT_THAT(IndicesNested, testing::ElementsAre(1u, 0u)); + } + + // indices + { + SmallVector IndicesSimple(ExtSimple->indices()); + EXPECT_THAT(IndicesSimple, testing::ElementsAre(0u)); + + SmallVector IndicesNested(ExtNested->indices()); + EXPECT_THAT(IndicesNested, testing::ElementsAre(1u, 0u)); + } + + // getAggregateOperand + EXPECT_EQ(ExtSimple->getAggregateOperand(), ArgAgg); + const auto *ConstExtSimple = ExtSimple; + EXPECT_EQ(ConstExtSimple->getAggregateOperand(), ArgAgg); + + // 
getAggregateOperandIndex + EXPECT_EQ(sandboxir::ExtractValueInst::getAggregateOperandIndex(), + llvm::ExtractValueInst::getAggregateOperandIndex()); + + // getIndices + EXPECT_EQ(ExtSimple->getIndices().size(), 1u); + EXPECT_EQ(ExtSimple->getIndices()[0], 0u); + + // getNumIndices + EXPECT_EQ(ExtSimple->getNumIndices(), 1u); + + // hasIndices + EXPECT_EQ(ExtSimple->hasIndices(), true); +} + TEST_F(SandboxIRTest, InsertValueInst) { parseIR(C, R"IR( define void @foo({i32, float} %agg, i32 %i) { From 412e3e394dbd1b7d8655639e161ed4dbd5505c96 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 29 Aug 2024 22:09:19 +0100 Subject: [PATCH 57/72] [ExtendLifetimes][NFC] Add explicit triple to remaining fake-use tests One of the tests for the new fake use intrinsic are failing on darwin buildbots due to relying on behaviour for their expected triple; this commit adds explicit triples to the few remaining fake-use tests that didn't have them. Fixes commit 3d08ade (#86149). Buildbot failures: https://lab.llvm.org/buildbot/#/builders/23/builds/2505 --- llvm/test/CodeGen/X86/fake-use-hpfloat.ll | 2 +- llvm/test/CodeGen/X86/fake-use-vector.ll | 2 +- llvm/test/DebugInfo/X86/fake-use.ll | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/X86/fake-use-hpfloat.ll b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll index 7a95c38801837c..fd511a6179acfe 100644 --- a/llvm/test/CodeGen/X86/fake-use-hpfloat.ll +++ b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll @@ -1,6 +1,6 @@ ; assert in DAGlegalizer with fake use of half precision float. ; Changes to half float promotion. 
-; RUN: llc -stop-after=finalize-isel -o - %s | FileCheck %s +; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s ; ; CHECK: bb.0.entry: ; CHECK-NEXT: %0:fr16 = FsFLD0SH diff --git a/llvm/test/CodeGen/X86/fake-use-vector.ll b/llvm/test/CodeGen/X86/fake-use-vector.ll index cb46ccc8cac11c..4d6ede30827046 100644 --- a/llvm/test/CodeGen/X86/fake-use-vector.ll +++ b/llvm/test/CodeGen/X86/fake-use-vector.ll @@ -1,5 +1,5 @@ ; assert in DAGlegalizer with fake use of 1-element vectors. -; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -filetype=asm -o - %s | FileCheck %s ; ; ModuleID = 't2.cpp' ; source_filename = "t2.cpp" diff --git a/llvm/test/DebugInfo/X86/fake-use.ll b/llvm/test/DebugInfo/X86/fake-use.ll index f44aadfeef5640..5ac5104a167118 100644 --- a/llvm/test/DebugInfo/X86/fake-use.ll +++ b/llvm/test/DebugInfo/X86/fake-use.ll @@ -3,9 +3,9 @@ ; Make sure the fake use of 'b' at the end of 'foo' causes location information for 'b' ; to extend all the way to the end of the function. -; RUN: %llc_dwarf -O2 -filetype=obj -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: %llc_dwarf -O2 -filetype=obj -mtriple=x86_64-unknown-linux -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t ; RUN: %python %p/../Inputs/check-fake-use.py %t -; RUN: sed -e 's,call void (...) @llvm.fake.use,;,' %s | %llc_dwarf - -O2 -filetype=obj -dwarf-linkage-names=Abstract | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: sed -e 's,call void (...) 
@llvm.fake.use,;,' %s | %llc_dwarf - -O2 -filetype=obj -mtriple=x86_64-unknown-linux -dwarf-linkage-names=Abstract | llvm-dwarfdump --debug-info --debug-line -v - -o %t ; RUN: not %python %p/../Inputs/check-fake-use.py %t ; Generated with: From 7284e0f3a4f8924a0f69f654db8c4b4d00d232cb Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 29 Aug 2024 18:53:03 -0300 Subject: [PATCH 58/72] [clang] mangle placeholder for deduced type as a template-prefix (#106335) As agreed on https://github.com/itanium-cxx-abi/cxx-abi/issues/109 these placeholders should be mangled as a `template-prefix` production. ``` ::=