From fc110202dffa06950716e0cc4535b07aaa2c439c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 08:00:25 -0700 Subject: [PATCH 01/72] [Support] Validate number of arguments passed to formatv() (#105745) Change formatv() to validate that the number of arguments passed matches number of replacement fields in the format string, and that the replacement indices do not contain holes. To support cases where this cannot be guaranteed, introduce a formatv() overload that allows disabling validation with a bool flag as its first argument. --- .../Checkers/StdLibraryFunctionsChecker.cpp | 5 +- llvm/benchmarks/CMakeLists.txt | 1 + llvm/benchmarks/FormatVariadicBM.cpp | 63 ++++++++++++++ llvm/include/llvm/Support/FormatVariadic.h | 39 +++++---- llvm/lib/Support/FormatVariadic.cpp | 85 ++++++++++++++++--- llvm/unittests/Support/FormatVariadicTest.cpp | 66 ++++++++------ mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 +- 7 files changed, 205 insertions(+), 58 deletions(-) create mode 100644 llvm/benchmarks/FormatVariadicBM.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 8f4bd17afc8581..4f30b2a0e7e7da 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1401,7 +1401,10 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, ErrnoNote = llvm::formatv("After calling '{0}' {1}", FunctionName, ErrnoNote); } else { - CaseNote = llvm::formatv(Case.getNote().str().c_str(), FunctionName); + // Disable formatv() validation as the case note may not always have the + // {0} placeholder for function name. 
+ CaseNote = + llvm::formatv(false, Case.getNote().str().c_str(), FunctionName); } const SVal RV = Call.getReturnValue(); diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index 713d4ccd3c5975..e3366e6f3ffe19 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -5,3 +5,4 @@ set(LLVM_LINK_COMPONENTS add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp new file mode 100644 index 00000000000000..c03ead400d0d5c --- /dev/null +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -0,0 +1,63 @@ +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +using namespace llvm; +using namespace std; + +// Generate a list of format strings that have `NumReplacements` replacements +// by permuting the replacements and some literal text. +static vector getFormatStrings(int NumReplacements) { + vector Components; + for (int I = 0; I < NumReplacements; I++) + Components.push_back("{" + to_string(I) + "}"); + // Intersperse these with some other literal text (_). 
+ const string_view Literal = "____"; + for (char C : Literal) + Components.push_back(string(1, C)); + + vector Formats; + do { + string Concat; + for (const string &C : Components) + Concat += C; + Formats.emplace_back(Concat); + } while (next_permutation(Components.begin(), Components.end())); + return Formats; +} + +// Generate the set of formats to exercise outside the benchmark code. +static const vector> Formats = { + getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), + getFormatStrings(4), getFormatStrings(5), +}; + +// Benchmark formatv() for a variety of format strings and 1-5 replacements. +static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 595f2cf559a428..f31ad70021579e 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -67,23 +67,20 @@ class formatv_object_base { protected: StringRef Fmt; ArrayRef Adapters; - - static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad); - - static std::pair - splitLiteralAndReplacement(StringRef Fmt); + bool Validate; formatv_object_base(StringRef Fmt, - ArrayRef Adapters) - : Fmt(Fmt), Adapters(Adapters) {} + ArrayRef Adapters, + bool Validate) + : Fmt(Fmt), Adapters(Adapters), Validate(Validate) {} formatv_object_base(formatv_object_base const &rhs) = delete; formatv_object_base(formatv_object_base &&rhs) = default; public: void 
format(raw_ostream &S) const { - for (auto &R : parseFormatString(Fmt)) { + const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); + for (const auto &R : Replacements) { if (R.Type == ReplacementType::Empty) continue; if (R.Type == ReplacementType::Literal) { @@ -101,9 +98,10 @@ class formatv_object_base { Align.format(S, R.Options); } } - static SmallVector parseFormatString(StringRef Fmt); - static std::optional parseReplacementItem(StringRef Spec); + // Parse and optionally validate format string (in debug builds). + static SmallVector + parseFormatString(StringRef Fmt, size_t NumArgs, bool Validate); std::string str() const { std::string Result; @@ -149,8 +147,8 @@ template class formatv_object : public formatv_object_base { }; public: - formatv_object(StringRef Fmt, Tuple &&Params) - : formatv_object_base(Fmt, ParameterPointers), + formatv_object(StringRef Fmt, Tuple &&Params, bool Validate) + : formatv_object_base(Fmt, ParameterPointers, Validate), Parameters(std::move(Params)) { ParameterPointers = std::apply(create_adapters(), Parameters); } @@ -247,15 +245,22 @@ template class formatv_object : public formatv_object_base { // assertion. Otherwise, it will try to do something reasonable, but in general // the details of what that is are undefined. // + +// formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(const char *Fmt, Ts &&...Vals) +inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) -> formatv_object(Vals))...))> { using ParamTuple = decltype(std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...)); - return formatv_object( - Fmt, std::make_tuple(support::detail::build_format_adapter( - std::forward(Vals))...)); + auto Params = std::make_tuple( + support::detail::build_format_adapter(std::forward(Vals))...); + return formatv_object(Fmt, std::move(Params), Validate); +} + +// formatv() with validation enabled. 
+template inline auto formatv(const char *Fmt, Ts &&...Vals) { + return formatv(true, Fmt, std::forward(Vals)...); } } // end namespace llvm diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index e25d036cdf1e8c..26d2b549136e43 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -25,8 +25,8 @@ static std::optional translateLocChar(char C) { LLVM_BUILTIN_UNREACHABLE; } -bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { +static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -35,8 +35,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, if (Spec.size() > 1) { // A maximum of 2 characters at the beginning can be used for something - // other - // than the width. + // other than the width. // If Spec[1] is a loc char, then Spec[0] is a pad char and Spec[2:...] // contains the width. // Otherwise, if Spec[0] is a loc char, then Spec[1:...] contains the width. 
@@ -55,8 +54,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, return !Failed; } -std::optional -formatv_object_base::parseReplacementItem(StringRef Spec) { +static std::optional parseReplacementItem(StringRef Spec) { StringRef RepString = Spec.trim("{}"); // If the replacement sequence does not start with a non-negative integer, @@ -82,15 +80,14 @@ formatv_object_base::parseReplacementItem(StringRef Spec) { RepString = StringRef(); } RepString = RepString.trim(); - if (!RepString.empty()) { - assert(false && "Unexpected characters found in replacement string!"); - } + assert(RepString.empty() && + "Unexpected characters found in replacement string!"); return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; } -std::pair -formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { +static std::pair +splitLiteralAndReplacement(StringRef Fmt) { while (!Fmt.empty()) { // Everything up until the first brace is a literal. if (Fmt.front() != '{') { @@ -143,15 +140,77 @@ formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { return std::make_pair(ReplacementItem{Fmt}, StringRef()); } +#ifndef NDEBUG +#define ENABLE_VALIDATION 1 +#else +#define ENABLE_VALIDATION 0 // Conveniently enable validation in release mode. 
+#endif + SmallVector -formatv_object_base::parseFormatString(StringRef Fmt) { +formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, + bool Validate) { SmallVector Replacements; - ReplacementItem I; + +#if ENABLE_VALIDATION + const StringRef SavedFmtStr = Fmt; + size_t NumExpectedArgs = 0; +#endif + while (!Fmt.empty()) { + ReplacementItem I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); if (I.Type != ReplacementType::Empty) Replacements.push_back(I); +#if ENABLE_VALIDATION + if (I.Type == ReplacementType::Format) + NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); +#endif + } + +#if ENABLE_VALIDATION + if (!Validate) + return Replacements; + + // Perform additional validation. Verify that the number of arguments matches + // the number of replacement indices and that there are no holes in the + // replacement indices. + + // When validation fails, return an array of replacement items that + // will print an error message as the outout of this formatv() (used when + // validation is enabled in release mode). + auto getErrorReplacements = [SavedFmtStr](StringLiteral ErrorMsg) { + return SmallVector{ + ReplacementItem("Invalid formatv() call: "), ReplacementItem(ErrorMsg), + ReplacementItem(" for format string: "), ReplacementItem(SavedFmtStr)}; + }; + + if (NumExpectedArgs != NumArgs) { + errs() << formatv( + "Expected {0} Args, but got {1} for format string '{2}'\n", + NumExpectedArgs, NumArgs, SavedFmtStr); + assert(0 && "Invalid formatv() call"); + return getErrorReplacements("Unexpected number of arguments"); + } + + // Find the number of unique indices seen. All replacement indices + // are < NumExpectedArgs. 
+ SmallVector Indices(NumExpectedArgs); + size_t Count = 0; + for (const ReplacementItem &I : Replacements) { + if (I.Type != ReplacementType::Format || Indices[I.Index]) + continue; + Indices[I.Index] = true; + ++Count; + } + + if (Count != NumExpectedArgs) { + errs() << formatv( + "Replacement field indices cannot have holes for format string '{0}'\n", + SavedFmtStr); + assert(0 && "Invalid format string"); + return getErrorReplacements("Replacement indices have holes"); } +#endif // ENABLE_VALIDATION return Replacements; } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index a78b25c53d7e43..4c648d87fc2de7 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -9,9 +9,11 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatAdapters.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; +using ::testing::HasSubstr; // Compile-time tests templates in the detail namespace. namespace { @@ -35,14 +37,19 @@ struct NoFormat {}; static_assert(uses_missing_provider::value, ""); } +// Helper to parse format string with no validation. 
+static SmallVector parseFormatString(StringRef Fmt) { + return formatv_object_base::parseFormatString(Fmt, 0, false); +} + TEST(FormatVariadicTest, EmptyFormatString) { - auto Replacements = formatv_object_base::parseFormatString(""); + auto Replacements = parseFormatString(""); EXPECT_EQ(0U, Replacements.size()); } TEST(FormatVariadicTest, NoReplacements) { const StringRef kFormatString = "This is a test"; - auto Replacements = formatv_object_base::parseFormatString(kFormatString); + auto Replacements = parseFormatString(kFormatString); ASSERT_EQ(1U, Replacements.size()); EXPECT_EQ(kFormatString, Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -50,25 +57,25 @@ TEST(FormatVariadicTest, NoReplacements) { TEST(FormatVariadicTest, EscapedBrace) { // {{ should be replaced with { - auto Replacements = formatv_object_base::parseFormatString("{{"); + auto Replacements = parseFormatString("{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // An even number N of braces should be replaced with N/2 braces. - Replacements = formatv_object_base::parseFormatString("{{{{{{"); + Replacements = parseFormatString("{{{{{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{{{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. - Replacements = formatv_object_base::parseFormatString("}"); + Replacements = parseFormatString("}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. 
- Replacements = formatv_object_base::parseFormatString("}}}"); + Replacements = parseFormatString("}}}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}}}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -76,14 +83,14 @@ TEST(FormatVariadicTest, EscapedBrace) { TEST(FormatVariadicTest, ValidReplacementSequence) { // 1. Simple replacement - parameter index only - auto Replacements = formatv_object_base::parseFormatString("{0}"); + auto Replacements = parseFormatString("{0}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(0u, Replacements[0].Align); EXPECT_EQ("", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{1}"); + Replacements = parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); @@ -92,7 +99,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 2. Parameter index with right alignment - Replacements = formatv_object_base::parseFormatString("{0,3}"); + Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -101,7 +108,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 3. And left alignment - Replacements = formatv_object_base::parseFormatString("{0,-3}"); + Replacements = parseFormatString("{0,-3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -110,7 +117,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. 
And center alignment - Replacements = formatv_object_base::parseFormatString("{0,=3}"); + Replacements = parseFormatString("{0,=3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -119,7 +126,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. Parameter index with option string - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -128,7 +135,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 5. Parameter index with alignment before option string - Replacements = formatv_object_base::parseFormatString("{0,-3:foo}"); + Replacements = parseFormatString("{0,-3:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -137,7 +144,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 7. Parameter indices, options, and alignment can all have whitespace. - Replacements = formatv_object_base::parseFormatString("{ 0, -3 : foo }"); + Replacements = parseFormatString("{ 0, -3 : foo }"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -147,7 +154,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. 
- Replacements = formatv_object_base::parseFormatString("{0:0:1}"); + Replacements = parseFormatString("{0:0:1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -157,7 +164,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:1", Replacements[0].Options); // 9. Custom padding character - Replacements = formatv_object_base::parseFormatString("{0,p+4:foo}"); + Replacements = parseFormatString("{0,p+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -168,7 +175,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // Format string special characters are allowed as padding character - Replacements = formatv_object_base::parseFormatString("{0,-+4:foo}"); + Replacements = parseFormatString("{0,-+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -178,7 +185,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,+-4:foo}"); + Replacements = parseFormatString("{0,+-4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -188,7 +195,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,==4:foo}"); + Replacements = parseFormatString("{0,==4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -198,7 +205,7 @@ TEST(FormatVariadicTest, 
ValidReplacementSequence) { EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,:=4:foo}"); + Replacements = parseFormatString("{0,:=4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -211,7 +218,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { TEST(FormatVariadicTest, DefaultReplacementValues) { // 2. If options string is missing, it defaults to empty. - auto Replacements = formatv_object_base::parseFormatString("{0,3}"); + auto Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -219,7 +226,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. - Replacements = formatv_object_base::parseFormatString("{0,3:}"); + Replacements = parseFormatString("{0,3:}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -227,7 +234,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); @@ -238,8 +245,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { } TEST(FormatVariadicTest, MultipleReplacements) { - auto Replacements = - formatv_object_base::parseFormatString("{0} {1:foo}-{2,-3:bar}"); + auto Replacements = parseFormatString("{0} {1:foo}-{2,-3:bar}"); ASSERT_EQ(5u, Replacements.size()); // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -704,6 +710,16 @@ TEST(FormatVariadicTest, FormatFilterRange) { EXPECT_EQ("1, 2, 3", formatv("{0}", Range).str()); } +#ifdef NDEBUG // Disable the test in debug builds where it will assert. +TEST(FormatVariadicTest, Validate) { + std::string Str = formatv("{0}", 1, 2).str(); + EXPECT_THAT(Str, HasSubstr("Unexpected number of arguments")); + + Str = formatv("{0} {2}", 1, 2, 3).str(); + EXPECT_THAT(Str, HasSubstr("eplacement indices have holes")); +} +#endif // NDEBUG + namespace { enum class Base { First }; diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 82f8718fc556ad..7016fe41ca75d0 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1654,12 +1654,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, dir->shouldBeQualified() ? 
qualifiedTypeParserCode : typeParserCode; TypeSwitch(dir->getArg()) .Case([&](auto operand) { - body << formatv(parserCode, + body << formatv(false, parserCode, operand->getVar()->constraint.getCppType(), listName); }) .Default([&](auto operand) { - body << formatv(parserCode, "::mlir::Type", listName); + body << formatv(false, parserCode, "::mlir::Type", listName); }); } } else if (auto *dir = dyn_cast(element)) { From 025f03f01e8584140b7ac27422cea0c0ef7ef6c1 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 29 Aug 2024 17:05:02 +0200 Subject: [PATCH 02/72] [libc++][NFC] Remove unused struct in (#106527) --- libcxx/include/string | 8 -------- 1 file changed, 8 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 05d42afb7c9c3d..45be4050304125 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -3462,14 +3462,6 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat // find -template -struct _LIBCPP_HIDDEN __traits_eq { - typedef typename _Traits::char_type char_type; - _LIBCPP_HIDE_FROM_ABI bool operator()(const char_type& __x, const char_type& __y) _NOEXCEPT { - return _Traits::eq(__x, __y); - } -}; - template _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { From a705e8cb5b071b3bf6d1d55629f18f6b7b9699ac Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 29 Aug 2024 17:05:56 +0200 Subject: [PATCH 03/72] [libc++][NFC] Remove __constexpr_is{nan,finite} (#106205) They're never used in `constexpr` functions, so we can simply use `std::isnan` and `std::isfinite` instead. 
--- libcxx/include/cmath | 28 ----------- libcxx/include/complex | 50 +++++++++---------- .../numerics/c.math/constexpr-fns.pass.cpp | 2 - 3 files changed, 24 insertions(+), 56 deletions(-) diff --git a/libcxx/include/cmath b/libcxx/include/cmath index 6480c4678ce33d..5d30b151870e0d 100644 --- a/libcxx/include/cmath +++ b/libcxx/include/cmath @@ -554,20 +554,6 @@ using ::scalbnl _LIBCPP_USING_IF_EXISTS; using ::tgammal _LIBCPP_USING_IF_EXISTS; using ::truncl _LIBCPP_USING_IF_EXISTS; -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { -#if __has_builtin(__builtin_isnan) - return __builtin_isnan(__lcpp_x); -#else - return isnan(__lcpp_x); -#endif -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { - return std::isnan(__lcpp_x); -} - template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NOEXCEPT { #if __has_builtin(__builtin_isinf) @@ -582,20 +568,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NO return std::isinf(__lcpp_x); } -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { -#if __has_builtin(__builtin_isfinite) - return __builtin_isfinite(__lcpp_x); -#else - return isfinite(__lcpp_x); -#endif -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { - return __builtin_isfinite(__lcpp_x); -} - #if _LIBCPP_STD_VER >= 20 template _LIBCPP_HIDE_FROM_ABI constexpr _Fp __lerp(_Fp __a, _Fp __b, _Fp __t) noexcept { diff --git a/libcxx/include/complex b/libcxx/include/complex index e6534025de57e5..94fd8ee347dffb 100644 --- a/libcxx/include/complex +++ b/libcxx/include/complex @@ -1019,9 +1019,9 @@ inline _LIBCPP_HIDE_FROM_ABI typename __libcpp_complex_overload_traits<_Tp>::_Co template _LIBCPP_HIDE_FROM_ABI 
complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = _Tp()) { - if (std::__constexpr_isnan(__rho) || std::signbit(__rho)) + if (std::isnan(__rho) || std::signbit(__rho)) return complex<_Tp>(_Tp(NAN), _Tp(NAN)); - if (std::__constexpr_isnan(__theta)) { + if (std::isnan(__theta)) { if (std::__constexpr_isinf(__rho)) return complex<_Tp>(__rho, __theta); return complex<_Tp>(__theta, __theta); @@ -1032,10 +1032,10 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = return complex<_Tp>(_Tp(NAN), _Tp(NAN)); } _Tp __x = __rho * std::cos(__theta); - if (std::__constexpr_isnan(__x)) + if (std::isnan(__x)) __x = 0; _Tp __y = __rho * std::sin(__theta); - if (std::__constexpr_isnan(__y)) + if (std::isnan(__y)) __y = 0; return complex<_Tp>(__x, __y); } @@ -1062,10 +1062,8 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> sqrt(const complex<_Tp>& __x) { return complex<_Tp>(_Tp(INFINITY), __x.imag()); if (std::__constexpr_isinf(__x.real())) { if (__x.real() > _Tp(0)) - return complex<_Tp>( - __x.real(), std::__constexpr_isnan(__x.imag()) ? __x.imag() : std::copysign(_Tp(0), __x.imag())); - return complex<_Tp>( - std::__constexpr_isnan(__x.imag()) ? __x.imag() : _Tp(0), std::copysign(__x.real(), __x.imag())); + return complex<_Tp>(__x.real(), std::isnan(__x.imag()) ? __x.imag() : std::copysign(_Tp(0), __x.imag())); + return complex<_Tp>(std::isnan(__x.imag()) ? 
__x.imag() : _Tp(0), std::copysign(__x.real(), __x.imag())); } return std::polar(std::sqrt(std::abs(__x)), std::arg(__x) / _Tp(2)); } @@ -1080,9 +1078,9 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> exp(const complex<_Tp>& __x) { } if (std::__constexpr_isinf(__x.real())) { if (__x.real() < _Tp(0)) { - if (!std::__constexpr_isfinite(__i)) + if (!std::isfinite(__i)) __i = _Tp(1); - } else if (__i == 0 || !std::__constexpr_isfinite(__i)) { + } else if (__i == 0 || !std::isfinite(__i)) { if (std::__constexpr_isinf(__i)) __i = _Tp(NAN); return complex<_Tp>(__x.real(), __i); @@ -1131,13 +1129,13 @@ template _LIBCPP_HIDE_FROM_ABI complex<_Tp> asinh(const complex<_Tp>& __x) { const _Tp __pi(atan2(+0., -0.)); if (std::__constexpr_isinf(__x.real())) { - if (std::__constexpr_isnan(__x.imag())) + if (std::isnan(__x.imag())) return __x; if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(__x.real(), std::copysign(__pi * _Tp(0.25), __x.imag())); return complex<_Tp>(__x.real(), std::copysign(_Tp(0), __x.imag())); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(__x.imag(), __x.real()); if (__x.imag() == 0) @@ -1156,7 +1154,7 @@ template _LIBCPP_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) { const _Tp __pi(atan2(+0., -0.)); if (std::__constexpr_isinf(__x.real())) { - if (std::__constexpr_isnan(__x.imag())) + if (std::isnan(__x.imag())) return complex<_Tp>(std::abs(__x.real()), __x.imag()); if (std::__constexpr_isinf(__x.imag())) { if (__x.real() > 0) @@ -1168,7 +1166,7 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) { return complex<_Tp>(-__x.real(), std::copysign(__pi, __x.imag())); return complex<_Tp>(__x.real(), std::copysign(_Tp(0), __x.imag())); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(std::abs(__x.imag()), __x.real()); return complex<_Tp>(__x.real(), 
__x.real()); @@ -1187,12 +1185,12 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) { if (std::__constexpr_isinf(__x.imag())) { return complex<_Tp>(std::copysign(_Tp(0), __x.real()), std::copysign(__pi / _Tp(2), __x.imag())); } - if (std::__constexpr_isnan(__x.imag())) { + if (std::isnan(__x.imag())) { if (std::__constexpr_isinf(__x.real()) || __x.real() == 0) return complex<_Tp>(std::copysign(_Tp(0), __x.real()), __x.imag()); return complex<_Tp>(__x.imag(), __x.imag()); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { return complex<_Tp>(__x.real(), __x.real()); } if (std::__constexpr_isinf(__x.real())) { @@ -1209,11 +1207,11 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) { template _LIBCPP_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) { - if (std::__constexpr_isinf(__x.real()) && !std::__constexpr_isfinite(__x.imag())) + if (std::__constexpr_isinf(__x.real()) && !std::isfinite(__x.imag())) return complex<_Tp>(__x.real(), _Tp(NAN)); - if (__x.real() == 0 && !std::__constexpr_isfinite(__x.imag())) + if (__x.real() == 0 && !std::isfinite(__x.imag())) return complex<_Tp>(__x.real(), _Tp(NAN)); - if (__x.imag() == 0 && !std::__constexpr_isfinite(__x.real())) + if (__x.imag() == 0 && !std::isfinite(__x.real())) return __x; return complex<_Tp>(std::sinh(__x.real()) * std::cos(__x.imag()), std::cosh(__x.real()) * std::sin(__x.imag())); } @@ -1222,13 +1220,13 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) { template _LIBCPP_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { - if (std::__constexpr_isinf(__x.real()) && !std::__constexpr_isfinite(__x.imag())) + if (std::__constexpr_isinf(__x.real()) && !std::isfinite(__x.imag())) return complex<_Tp>(std::abs(__x.real()), _Tp(NAN)); - if (__x.real() == 0 && !std::__constexpr_isfinite(__x.imag())) + if (__x.real() == 0 && !std::isfinite(__x.imag())) return complex<_Tp>(_Tp(NAN), __x.real()); if (__x.real() == 0 && 
__x.imag() == 0) return complex<_Tp>(_Tp(1), __x.imag()); - if (__x.imag() == 0 && !std::__constexpr_isfinite(__x.real())) + if (__x.imag() == 0 && !std::isfinite(__x.real())) return complex<_Tp>(std::abs(__x.real()), __x.imag()); return complex<_Tp>(std::cosh(__x.real()) * std::cos(__x.imag()), std::sinh(__x.real()) * std::sin(__x.imag())); } @@ -1238,11 +1236,11 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { template _LIBCPP_HIDE_FROM_ABI complex<_Tp> tanh(const complex<_Tp>& __x) { if (std::__constexpr_isinf(__x.real())) { - if (!std::__constexpr_isfinite(__x.imag())) + if (!std::isfinite(__x.imag())) return complex<_Tp>(std::copysign(_Tp(1), __x.real()), _Tp(0)); return complex<_Tp>(std::copysign(_Tp(1), __x.real()), std::copysign(_Tp(0), std::sin(_Tp(2) * __x.imag()))); } - if (std::__constexpr_isnan(__x.real()) && __x.imag() == 0) + if (std::isnan(__x.real()) && __x.imag() == 0) return __x; _Tp __2r(_Tp(2) * __x.real()); _Tp __2i(_Tp(2) * __x.imag()); @@ -1267,7 +1265,7 @@ template _LIBCPP_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) { const _Tp __pi(atan2(+0., -0.)); if (std::__constexpr_isinf(__x.real())) { - if (std::__constexpr_isnan(__x.imag())) + if (std::isnan(__x.imag())) return complex<_Tp>(__x.imag(), __x.real()); if (std::__constexpr_isinf(__x.imag())) { if (__x.real() < _Tp(0)) @@ -1278,7 +1276,7 @@ _LIBCPP_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) { return complex<_Tp>(__pi, std::signbit(__x.imag()) ? -__x.real() : __x.real()); return complex<_Tp>(_Tp(0), std::signbit(__x.imag()) ? 
__x.real() : -__x.real()); } - if (std::__constexpr_isnan(__x.real())) { + if (std::isnan(__x.real())) { if (std::__constexpr_isinf(__x.imag())) return complex<_Tp>(__x.real(), -__x.imag()); return complex<_Tp>(__x.real(), __x.real()); diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp index 3739bc6ef04dd0..ff36293830c5de 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-fns.pass.cpp @@ -20,9 +20,7 @@ #include "test_macros.h" -static_assert(std::__constexpr_isnan(0.) == false, ""); static_assert(std::__constexpr_isinf(0.0) == false, ""); -static_assert(std::__constexpr_isfinite(0.0) == true, ""); int main(int, char**) { From 032c3283ab419377a1230a32d98693b528f63134 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 08:06:45 -0700 Subject: [PATCH 04/72] [NFC][TableGen] Refactor IntrinsicEmitter code (#106479) - Use formatv() and raw string literals to simplify emission code. - Use range based for loops and structured bindings to simplify loops. - Use const Pointers to Records. - Rename `ComputeFixedEncoding` to `ComputeTypeSignature` to reflect what the function actually does, cnd change it to return a vector. - Use reverse() and range based for loop to pack 8 nibbles into 32-bits. - Rename some variables to follow LLVM coding standards. - For function memory effects, print human readable effects in comment. 
--- llvm/test/TableGen/intrinsic-attrs.td | 2 +- llvm/utils/TableGen/IntrinsicEmitter.cpp | 393 ++++++++++++----------- 2 files changed, 204 insertions(+), 191 deletions(-) diff --git a/llvm/test/TableGen/intrinsic-attrs.td b/llvm/test/TableGen/intrinsic-attrs.td index 22019b8fb87140..29e8cb1e89bb01 100644 --- a/llvm/test/TableGen/intrinsic-attrs.td +++ b/llvm/test/TableGen/intrinsic-attrs.td @@ -60,7 +60,7 @@ def int_deref_ptr_ret : Intrinsic<[llvm_ptr_ty], [], [DereferenceableOffset, e = Set->Offset + Set->Count; i != e; ++i) { - OS << " " << Ints[i].EnumName; + bool First = true; + for (const auto &Int : ArrayRef(&Ints[Set->Offset], Set->Count)) { + OS << " " << Int.EnumName; // Assign a value to the first intrinsic in this target set so that all // intrinsic ids are distinct. - if (i == Set->Offset) - OS << " = " << (Set->Offset + 1); + if (First) { + OS << " = " << Set->Offset + 1; + First = false; + } OS << ", "; - if (Ints[i].EnumName.size() < 40) - OS.indent(40 - Ints[i].EnumName.size()); - OS << " // " << Ints[i].Name << "\n"; + if (Int.EnumName.size() < 40) + OS.indent(40 - Int.EnumName.size()); + OS << formatv(" // {0}\n", Int.Name); } // Emit num_intrinsics into the target neutral enum. 
if (IntrinsicPrefix.empty()) { - OS << " num_intrinsics = " << (Ints.size() + 1) << "\n"; + OS << formatv(" num_intrinsics = {0}\n", Ints.size() + 1); OS << "#endif\n\n"; } else { - OS << "}; // enum\n"; - OS << "} // namespace llvm::Intrinsic\n\n"; - OS << "#endif\n"; + OS << R"(}; // enum +} // namespace llvm::Intrinsic +#endif + +)"; } } @@ -181,8 +186,8 @@ void IntrinsicEmitter::EmitArgKind(raw_ostream &OS) { return; OS << "// llvm::Intrinsic::IITDescriptor::ArgKind.\n"; OS << "#ifdef GET_INTRINSIC_ARGKIND\n"; - if (auto RecArgKind = Records.getDef("ArgKind")) { - for (auto &RV : RecArgKind->getValues()) + if (const auto RecArgKind = Records.getDef("ArgKind")) { + for (const auto &RV : RecArgKind->getValues()) OS << " AK_" << RV.getName() << " = " << *RV.getValue() << ",\n"; } else { OS << "#error \"ArgKind is not defined\"\n"; @@ -194,7 +199,7 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) { OS << "#ifdef GET_INTRINSIC_IITINFO\n"; std::array RecsByNumber; auto IIT_Base = Records.getAllDerivedDefinitionsIfDefined("IIT_Base"); - for (auto Rec : IIT_Base) { + for (const Record *Rec : IIT_Base) { auto Number = Rec->getValueAsInt("Number"); assert(0 <= Number && Number < (int)RecsByNumber.size() && "IIT_Info.Number should be uint8_t"); @@ -213,26 +218,29 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) { void IntrinsicEmitter::EmitTargetInfo(const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Target mapping.\n"; - OS << "#ifdef GET_INTRINSIC_TARGET_DATA\n"; - OS << "struct IntrinsicTargetInfo {\n" - << " llvm::StringLiteral Name;\n" - << " size_t Offset;\n" - << " size_t Count;\n" - << "};\n"; - OS << "static constexpr IntrinsicTargetInfo TargetInfos[] = {\n"; - for (const auto &Target : Ints.Targets) - OS << " {llvm::StringLiteral(\"" << Target.Name << "\"), " << Target.Offset - << ", " << Target.Count << "},\n"; - OS << "};\n"; - OS << "#endif\n\n"; + OS << R"(// Target mapping. 
+#ifdef GET_INTRINSIC_TARGET_DATA +struct IntrinsicTargetInfo { + StringLiteral Name; + size_t Offset; + size_t Count; +}; +static constexpr IntrinsicTargetInfo TargetInfos[] = { +)"; + for (const auto [Name, Offset, Count] : Ints.Targets) + OS << formatv(" {{\"{0}\", {1}, {2}},\n", Name, Offset, Count); + OS << R"(}; +#endif + +)"; } void IntrinsicEmitter::EmitIntrinsicToNameTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Intrinsic ID to name table.\n"; - OS << "#ifdef GET_INTRINSIC_NAME_TABLE\n"; - OS << " // Note that entry #0 is the invalid intrinsic!\n"; + OS << R"(// Intrinsic ID to name table. +#ifdef GET_INTRINSIC_NAME_TABLE +// Note that entry #0 is the invalid intrinsic! +)"; for (const auto &Int : Ints) OS << " \"" << Int.Name << "\",\n"; OS << "#endif\n\n"; @@ -240,16 +248,19 @@ void IntrinsicEmitter::EmitIntrinsicToNameTable( void IntrinsicEmitter::EmitIntrinsicToOverloadTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Intrinsic ID to overload bitset.\n"; - OS << "#ifdef GET_INTRINSIC_OVERLOAD_TABLE\n"; - OS << "static constexpr uint8_t OTable[] = {\n"; - OS << " 0"; - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { + OS << R"(// Intrinsic ID to overload bitset. +#ifdef GET_INTRINSIC_OVERLOAD_TABLE +static constexpr uint8_t OTable[] = { + 0 + )"; + for (auto [I, Int] : enumerate(Ints)) { // Add one to the index so we emit a null bit for the invalid #0 intrinsic. - if ((i + 1) % 8 == 0) + size_t Idx = I + 1; + + if (Idx % 8 == 0) OS << ",\n 0"; - if (Ints[i].isOverloaded) - OS << " | (1<<" << (i + 1) % 8 << ')'; + if (Int.isOverloaded) + OS << " | (1<<" << Idx % 8 << ')'; } OS << "\n};\n\n"; // OTable contains a true bit at the position if the intrinsic is overloaded. @@ -257,20 +268,18 @@ void IntrinsicEmitter::EmitIntrinsicToOverloadTable( OS << "#endif\n\n"; } -/// ComputeFixedEncoding - If we can encode the type signature for this -/// intrinsic into 32 bits, return it. If not, return ~0U. 
-static void ComputeFixedEncoding(const CodeGenIntrinsic &Int, - std::vector &TypeSig) { - if (auto *R = Int.TheDef->getValue("TypeSig")) { - for (auto &a : cast(R->getValue())->getValues()) { - for (auto &b : cast(a)->getValues()) - TypeSig.push_back(cast(b)->getValue()); +using TypeSigTy = SmallVector; + +/// Computes type signature of the intrinsic \p Int. +static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) { + TypeSigTy TypeSig; + if (const auto *R = Int.TheDef->getValue("TypeSig")) { + for (const auto *a : cast(R->getValue())->getValues()) { + for (const auto *b : cast(a)->getValues()) + TypeSig.emplace_back(cast(b)->getValue()); } } -} - -static void printIITEntry(raw_ostream &OS, unsigned char X) { - OS << (unsigned)X; + return TypeSig; } void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, @@ -278,29 +287,28 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, // If we can compute a 32-bit fixed encoding for this intrinsic, do so and // capture it in this vector, otherwise store a ~0U. std::vector FixedEncodings; + SequenceToOffsetTable LongEncodingTable; - SequenceToOffsetTable> LongEncodingTable; - - std::vector TypeSig; + FixedEncodings.reserve(Ints.size()); // Compute the unique argument type info. - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { + for (const CodeGenIntrinsic &Int : Ints) { // Get the signature for the intrinsic. - TypeSig.clear(); - ComputeFixedEncoding(Ints[i], TypeSig); + TypeSigTy TypeSig = ComputeTypeSignature(Int); - // Check to see if we can encode it into a 32-bit word. We can only encode + // Check to see if we can encode it into a 32-bit word. We can only encode // 8 nibbles into a 32-bit word. if (TypeSig.size() <= 8) { - bool Failed = false; + // Attempt to pack elements of TypeSig into a 32-bit word, starting from + // the most significant nibble. 
unsigned Result = 0; - for (unsigned i = 0, e = TypeSig.size(); i != e; ++i) { - // If we had an unencodable argument, bail out. - if (TypeSig[i] > 15) { + bool Failed = false; + for (unsigned char C : reverse(TypeSig)) { + if (C > 15) { Failed = true; break; } - Result = (Result << 4) | TypeSig[e - i - 1]; + Result = (Result << 4) | C; } // If this could be encoded into a 31-bit word, return it. @@ -320,23 +328,22 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, LongEncodingTable.layout(); - OS << "// Global intrinsic function declaration type table.\n"; - OS << "#ifdef GET_INTRINSIC_GENERATOR_GLOBAL\n"; + OS << R"(// Global intrinsic function declaration type table. +#ifdef GET_INTRINSIC_GENERATOR_GLOBAL +static constexpr unsigned IIT_Table[] = { + )"; - OS << "static constexpr unsigned IIT_Table[] = {\n "; - - for (unsigned i = 0, e = FixedEncodings.size(); i != e; ++i) { - if ((i & 7) == 7) + for (auto [Idx, FixedEncoding, Int] : enumerate(FixedEncodings, Ints)) { + if ((Idx & 7) == 7) OS << "\n "; // If the entry fit in the table, just emit it. - if (FixedEncodings[i] != ~0U) { - OS << "0x" << Twine::utohexstr(FixedEncodings[i]) << ", "; + if (FixedEncoding != ~0U) { + OS << "0x" << Twine::utohexstr(FixedEncoding) << ", "; continue; } - TypeSig.clear(); - ComputeFixedEncoding(Ints[i], TypeSig); + TypeSigTy TypeSig = ComputeTypeSignature(Int); // Otherwise, emit the offset into the long encoding table. We emit it this // way so that it is easier to read the offset in the .def file. @@ -348,7 +355,8 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, // Emit the shared table of register lists. 
OS << "static constexpr unsigned char IIT_LongEncodingTable[] = {\n"; if (!LongEncodingTable.empty()) - LongEncodingTable.emit(OS, printIITEntry); + LongEncodingTable.emit( + OS, [](raw_ostream &OS, unsigned char C) { OS << (unsigned)C; }); OS << " 255\n};\n\n"; OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL @@ -399,16 +407,14 @@ struct AttributeComparator { /// EmitAttributes - This emits the Intrinsic::getAttributes method. void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << "// Add parameter attributes that are not common to all intrinsics.\n"; - OS << "#ifdef GET_INTRINSIC_ATTRIBUTES\n"; - + OS << R"(// Add parameter attributes that are not common to all intrinsics. +#ifdef GET_INTRINSIC_ATTRIBUTES +static AttributeSet getIntrinsicArgAttributeSet(LLVMContext &C, unsigned ID) { + switch (ID) { + default: llvm_unreachable("Invalid attribute set number");)"; // Compute unique argument attribute sets. std::map, unsigned> UniqArgAttributes; - OS << "static AttributeSet getIntrinsicArgAttributeSet(" - << "LLVMContext &C, unsigned ID) {\n" - << " switch (ID) {\n" - << " default: llvm_unreachable(\"Invalid attribute set number\");\n"; for (const CodeGenIntrinsic &Int : Ints) { for (auto &Attrs : Int.ArgumentAttributes) { if (Attrs.empty()) @@ -419,118 +425,127 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, continue; assert(is_sorted(Attrs) && "Argument attributes are not sorted"); - - OS << " case " << ID << ":\n"; - OS << " return AttributeSet::get(C, {\n"; - for (const CodeGenIntrinsic::ArgAttribute &Attr : Attrs) { - switch (Attr.Kind) { + auto getAttrEnumName = + [](CodeGenIntrinsic::ArgAttrKind Kind) -> StringRef { + switch (Kind) { case CodeGenIntrinsic::NoCapture: - OS << " Attribute::get(C, Attribute::NoCapture),\n"; - break; + return "NoCapture"; case CodeGenIntrinsic::NoAlias: - OS << " Attribute::get(C, Attribute::NoAlias),\n"; - break; + return "NoAlias"; case 
CodeGenIntrinsic::NoUndef: - OS << " Attribute::get(C, Attribute::NoUndef),\n"; - break; + return "NoUndef"; case CodeGenIntrinsic::NonNull: - OS << " Attribute::get(C, Attribute::NonNull),\n"; - break; + return "NonNull"; case CodeGenIntrinsic::Returned: - OS << " Attribute::get(C, Attribute::Returned),\n"; - break; + return "Returned"; case CodeGenIntrinsic::ReadOnly: - OS << " Attribute::get(C, Attribute::ReadOnly),\n"; - break; + return "ReadOnly"; case CodeGenIntrinsic::WriteOnly: - OS << " Attribute::get(C, Attribute::WriteOnly),\n"; - break; + return "WriteOnly"; case CodeGenIntrinsic::ReadNone: - OS << " Attribute::get(C, Attribute::ReadNone),\n"; - break; + return "ReadNone"; case CodeGenIntrinsic::ImmArg: - OS << " Attribute::get(C, Attribute::ImmArg),\n"; - break; + return "ImmArg"; case CodeGenIntrinsic::Alignment: - OS << " Attribute::get(C, Attribute::Alignment, " << Attr.Value - << "),\n"; - break; + return "Alignment"; case CodeGenIntrinsic::Dereferenceable: - OS << " Attribute::get(C, Attribute::Dereferenceable, " - << Attr.Value << "),\n"; - break; + return "Dereferenceable"; } + }; + + OS << formatv(R"( + case {0}: + return AttributeSet::get(C, {{ +)", + ID); + for (const CodeGenIntrinsic::ArgAttribute &Attr : Attrs) { + StringRef AttrName = getAttrEnumName(Attr.Kind); + if (Attr.Kind == CodeGenIntrinsic::Alignment || + Attr.Kind == CodeGenIntrinsic::Dereferenceable) + OS << formatv(" Attribute::get(C, Attribute::{0}, {1}),\n", + AttrName, Attr.Value); + else + OS << formatv(" Attribute::get(C, Attribute::{0}),\n", AttrName); } - OS << " });\n"; + OS << " });"; } } - OS << " }\n"; - OS << "}\n\n"; + OS << R"( + } +} // getIntrinsicArgAttributeSet)"; // Compute unique function attribute sets. 
std::map UniqFnAttributes; - OS << "static AttributeSet getIntrinsicFnAttributeSet(" - << "LLVMContext &C, unsigned ID) {\n" - << " switch (ID) {\n" - << " default: llvm_unreachable(\"Invalid attribute set number\");\n"; + OS << R"( +static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { + switch (ID) { + default: llvm_unreachable("Invalid attribute set number");)"; for (const CodeGenIntrinsic &Intrinsic : Ints) { unsigned ID = UniqFnAttributes.size(); if (!UniqFnAttributes.try_emplace(&Intrinsic, ID).second) continue; - - OS << " case " << ID << ":\n" - << " return AttributeSet::get(C, {\n"; + OS << formatv(R"( + case {0}: + return AttributeSet::get(C, {{ +)", + ID); + auto addAttribute = [&OS](StringRef Attr) { + OS << formatv(" Attribute::get(C, Attribute::{0}),\n", Attr); + }; if (!Intrinsic.canThrow) - OS << " Attribute::get(C, Attribute::NoUnwind),\n"; + addAttribute("NoUnwind"); if (Intrinsic.isNoReturn) - OS << " Attribute::get(C, Attribute::NoReturn),\n"; + addAttribute("NoReturn"); if (Intrinsic.isNoCallback) - OS << " Attribute::get(C, Attribute::NoCallback),\n"; + addAttribute("NoCallback"); if (Intrinsic.isNoSync) - OS << " Attribute::get(C, Attribute::NoSync),\n"; + addAttribute("NoSync"); if (Intrinsic.isNoFree) - OS << " Attribute::get(C, Attribute::NoFree),\n"; + addAttribute("NoFree"); if (Intrinsic.isWillReturn) - OS << " Attribute::get(C, Attribute::WillReturn),\n"; + addAttribute("WillReturn"); if (Intrinsic.isCold) - OS << " Attribute::get(C, Attribute::Cold),\n"; + addAttribute("Cold"); if (Intrinsic.isNoDuplicate) - OS << " Attribute::get(C, Attribute::NoDuplicate),\n"; + addAttribute("NoDuplicate"); if (Intrinsic.isNoMerge) - OS << " Attribute::get(C, Attribute::NoMerge),\n"; + addAttribute("NoMerge"); if (Intrinsic.isConvergent) - OS << " Attribute::get(C, Attribute::Convergent),\n"; + addAttribute("Convergent"); if (Intrinsic.isSpeculatable) - OS << " Attribute::get(C, Attribute::Speculatable),\n"; + 
addAttribute("Speculatable"); if (Intrinsic.isStrictFP) - OS << " Attribute::get(C, Attribute::StrictFP),\n"; + addAttribute("StrictFP"); MemoryEffects ME = Intrinsic.ME; // TODO: IntrHasSideEffects should affect not only readnone intrinsics. if (ME.doesNotAccessMemory() && Intrinsic.hasSideEffects) ME = MemoryEffects::unknown(); if (ME != MemoryEffects::unknown()) { - OS << " Attribute::getWithMemoryEffects(C, " - << "MemoryEffects::createFromIntValue(" << ME.toIntValue() << ")),\n"; + OS << formatv(" // {0}\n", ME); + OS << formatv(" Attribute::getWithMemoryEffects(C, " + "MemoryEffects::createFromIntValue({0})),\n", + ME.toIntValue()); } OS << " });\n"; } - OS << " }\n"; - OS << "}\n\n"; - OS << "AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) {\n"; + OS << R"( } +} // getIntrinsicFnAttributeSet + +AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) { +)"; - // Compute the maximum number of attribute arguments and the map + // Compute the maximum number of attribute arguments and the map. typedef std::map UniqAttrMapTy; UniqAttrMapTy UniqAttributes; - unsigned maxArgAttrs = 0; + unsigned MaxArgAttrs = 0; unsigned AttrNum = 0; - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { - const CodeGenIntrinsic &intrinsic = Ints[i]; - maxArgAttrs = - std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size())); - unsigned &N = UniqAttributes[&intrinsic]; + for (const CodeGenIntrinsic &Int : Ints) { + MaxArgAttrs = + std::max(MaxArgAttrs, unsigned(Int.ArgumentAttributes.size())); + unsigned &N = UniqAttributes[&Int]; if (N) continue; N = ++AttrNum; @@ -539,67 +554,65 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, // Emit an array of AttributeList. Most intrinsics will have at least one // entry, for the function itself (index ~1), which is usually nounwind. 
- OS << " static constexpr uint16_t IntrinsicsToAttributesMap[] = {\n"; - - for (unsigned i = 0, e = Ints.size(); i != e; ++i) { - const CodeGenIntrinsic &intrinsic = Ints[i]; + OS << " static constexpr uint16_t IntrinsicsToAttributesMap[] = {"; + for (const CodeGenIntrinsic &Int : Ints) + OS << formatv("\n {0}, // {1}", UniqAttributes[&Int], Int.Name); - OS << " " << UniqAttributes[&intrinsic] << ", // " << intrinsic.Name - << "\n"; - } - OS << " };\n\n"; - - OS << " std::pair AS[" << maxArgAttrs + 1 << "];\n"; - OS << " unsigned NumAttrs = 0;\n"; - OS << " if (id != 0) {\n"; - OS << " switch(IntrinsicsToAttributesMap[id - 1]) {\n"; - OS << " default: llvm_unreachable(\"Invalid attribute number\");\n"; - for (auto UniqAttribute : UniqAttributes) { - OS << " case " << UniqAttribute.second << ": {\n"; + OS << formatv(R"( + }; + std::pair AS[{0}]; + unsigned NumAttrs = 0; + if (id != 0) {{ + switch(IntrinsicsToAttributesMap[id - 1]) {{ + default: llvm_unreachable("Invalid attribute number"); +)", + MaxArgAttrs + 1); - const CodeGenIntrinsic &Intrinsic = *(UniqAttribute.first); + for (const auto [IntPtr, UniqueID] : UniqAttributes) { + OS << formatv(" case {0}:\n", UniqueID); + const CodeGenIntrinsic &Int = *IntPtr; // Keep track of the number of attributes we're writing out. 
- unsigned numAttrs = 0; + unsigned NumAttrs = 0; - for (const auto &[AttrIdx, Attrs] : - enumerate(Intrinsic.ArgumentAttributes)) { + for (const auto &[AttrIdx, Attrs] : enumerate(Int.ArgumentAttributes)) { if (Attrs.empty()) continue; - unsigned ID = UniqArgAttributes.find(Attrs)->second; - OS << " AS[" << numAttrs++ << "] = {" << AttrIdx - << ", getIntrinsicArgAttributeSet(C, " << ID << ")};\n"; + unsigned ArgAttrID = UniqArgAttributes.find(Attrs)->second; + OS << formatv( + " AS[{0}] = {{{1}, getIntrinsicArgAttributeSet(C, {2})};\n", + NumAttrs++, AttrIdx, ArgAttrID); } - if (!Intrinsic.canThrow || - (Intrinsic.ME != MemoryEffects::unknown() && - !Intrinsic.hasSideEffects) || - Intrinsic.isNoReturn || Intrinsic.isNoCallback || Intrinsic.isNoSync || - Intrinsic.isNoFree || Intrinsic.isWillReturn || Intrinsic.isCold || - Intrinsic.isNoDuplicate || Intrinsic.isNoMerge || - Intrinsic.isConvergent || Intrinsic.isSpeculatable || - Intrinsic.isStrictFP) { - unsigned ID = UniqFnAttributes.find(&Intrinsic)->second; - OS << " AS[" << numAttrs++ << "] = {AttributeList::FunctionIndex, " - << "getIntrinsicFnAttributeSet(C, " << ID << ")};\n"; + if (!Int.canThrow || + (Int.ME != MemoryEffects::unknown() && !Int.hasSideEffects) || + Int.isNoReturn || Int.isNoCallback || Int.isNoSync || Int.isNoFree || + Int.isWillReturn || Int.isCold || Int.isNoDuplicate || Int.isNoMerge || + Int.isConvergent || Int.isSpeculatable || Int.isStrictFP) { + unsigned FnAttrID = UniqFnAttributes.find(&Int)->second; + OS << formatv(" AS[{0}] = {{AttributeList::FunctionIndex, " + "getIntrinsicFnAttributeSet(C, {1})};\n", + NumAttrs++, FnAttrID); } - if (numAttrs) { - OS << " NumAttrs = " << numAttrs << ";\n"; - OS << " break;\n"; - OS << " }\n"; + if (NumAttrs) { + OS << formatv(R"( NumAttrs = {0}; + break; +)", + NumAttrs); } else { OS << " return AttributeList();\n"; - OS << " }\n"; } } - OS << " }\n"; - OS << " }\n"; - OS << " return AttributeList::get(C, ArrayRef(AS, NumAttrs));\n"; - OS << 
"}\n"; - OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n"; + OS << R"( } + } + return AttributeList::get(C, ArrayRef(AS, NumAttrs)); +} +#endif // GET_INTRINSIC_ATTRIBUTES + +)"; } void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( From 4ee2ad259812159c4f51bf2d8edcf0376302b2c3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 29 Aug 2024 19:36:24 +0400 Subject: [PATCH 05/72] AArch64: Add tests for atomicrmw fp operations (#103701) There were only codegen tests for the fadd vector case, so round out the test coverage for the scalar cases and all the other operations. --- .../AArch64/atomicrmw-fadd-fp-vector.ll | 115 -- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 1209 ++++++++++++++++ llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 1272 +++++++++++++++++ llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 1272 +++++++++++++++++ llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 1209 ++++++++++++++++ 5 files changed, 4962 insertions(+), 115 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll deleted file mode 100644 index a7539ac3cce802..00000000000000 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll +++ /dev/null @@ -1,115 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s -; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s - -define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> 
%value) #0 { -; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4: -; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB0_2 -; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4: -; LSE: // %bb.0: -; LSE-NEXT: fcvtl v1.4s, v0.4h -; LSE-NEXT: ldr s0, [x0] -; LSE-NEXT: .LBB0_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fcvtl v2.4s, v0.4h -; LSE-NEXT: fmov w8, s0 -; LSE-NEXT: mov w10, w8 -; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; LSE-NEXT: fcvtn v2.4h, v2.4s -; LSE-NEXT: fmov w9, s2 -; LSE-NEXT: casal w10, w9, [x0] -; LSE-NEXT: fmov s0, w10 -; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB0_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; LSE-NEXT: ret - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4 - ret <2 x half> %res -} - -define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 { -; 
NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8: -; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB1_2 -; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 -; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; NOLSE-NEXT: fmov d0, d1 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8: -; LSE: // %bb.0: -; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB1_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; LSE-NEXT: fmov x8, d1 -; LSE-NEXT: mov x10, x8 -; LSE-NEXT: fmov x9, d2 -; LSE-NEXT: casal x10, x9, [x0] -; LSE-NEXT: fmov d1, x10 -; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB1_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: fmov d0, d1 -; LSE-NEXT: ret - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8 - ret <2 x float> %res -} - -attributes #0 = { nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll new file mode 100644 index 00000000000000..89c9880ffc7868 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -0,0 +1,1209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: 
test_atomicrmw_fadd_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne 
.LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; 
LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 
Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; 
NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: 
.LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: 
// %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, 
[x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fadd s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: 
Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fadd d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fadd_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl __addtf3 +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, 
#16] // 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl __addtf3 +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl __addtf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, 
[x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvtl v1.4s, v0.4h +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: fcvtl v2.4s, v0.4h +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s +; NOLSE-NEXT: fcvtn v2.4h, v2.4s +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvtl v1.4s, v0.4h +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvtl v2.4s, v0.4h +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s +; LSE-NEXT: fcvtn v2.4h, v2.4s +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, 
[x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: movi v1.4s, #1 +; NOLSE-NEXT: movi v2.4s, #127, msl #8 +; NOLSE-NEXT: shll v3.4s, v0.4h, #16 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: shll v4.4s, v0.4h, #16 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s +; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 +; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b +; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s +; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; 
NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: movi v1.4s, #1 +; LSE-NEXT: movi v2.4s, #127, msl #8 +; LSE-NEXT: shll v3.4s, v0.4h, #16 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: shll v4.4s, v0.4h, #16 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: fadd v4.4s, v4.4s, v3.4s +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: ushr v5.4s, v4.4s, #16 +; LSE-NEXT: and v5.16b, v5.16b, v1.16b +; LSE-NEXT: add v4.4s, v5.4s, v4.4s +; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fadd_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fadd v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; 
LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: 
Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll new file mode 100644 index 00000000000000..998d8ae0c1de4d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +; FIXME: Windows hosts assigns stack slots to different offsets for some reason. 
+; UNSUPPORTED: system-windows + +define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, 
#-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fmaxnm s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmax_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; 
NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fmaxnm d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fmax_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl fmaxl +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, #16] 
// 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl fmaxl +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl fmaxl +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; 
SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s1, h1 +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fcvt s4, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fcvt s3, h3 +; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h3, s3 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB7_3: // 
%atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: fcvt s1, h1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fcvt s4, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fcvt s3, h3 +; LSE-NEXT: fmaxnm s4, s4, s2 +; LSE-NEXT: fmaxnm s3, s3, s1 +; LSE-NEXT: fcvt h4, s4 +; LSE-NEXT: fcvt h3, s3 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] 
+; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s4, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: ubfx w12, w10, #16, #1 +; NOLSE-NEXT: add w10, w10, w8 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: add w10, w12, w10 +; NOLSE-NEXT: lsr w10, w10, #16 +; NOLSE-NEXT: ubfx w11, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: fmov s4, w10 +; 
NOLSE-NEXT: add w9, w11, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w11, [x0] +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s1 +; LSE-NEXT: fmov s2, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmaxnm s4, s4, s2 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmaxnm s3, s3, s1 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: ubfx w12, w10, #16, #1 +; LSE-NEXT: add w10, w10, w8 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: add w10, w12, w10 +; LSE-NEXT: lsr w10, w10, #16 +; LSE-NEXT: ubfx w11, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: add w9, w11, w9 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: casal w11, w10, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w9 +; LSE-NEXT: b.ne 
.LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal 
x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] 
+; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov 
x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll new file mode 100644 index 00000000000000..2697dbf5b2191d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc 
-mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +; FIXME: Windows hosts assigns stack slots to different offsets for some reason. +; UNSUPPORTED: system-windows + +define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: 
def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmin_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, half %value seq_cst, 
align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; 
LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, 
w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: 
fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: 
// in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fminnm s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne 
.LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fminnm d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fmin_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl fminl +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, #16] 
// 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl fminl +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl fminl +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; 
SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s1, h1 +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fcvt s4, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fcvt s3, h3 +; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h3, s3 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB7_3: // 
%atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: fcvt s1, h1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fcvt s4, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fcvt s3, h3 +; LSE-NEXT: fminnm s4, s4, s2 +; LSE-NEXT: fminnm s3, s3, s1 +; LSE-NEXT: fcvt h4, s4 +; LSE-NEXT: fcvt h3, s3 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] 
+; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s4, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: ubfx w12, w10, #16, #1 +; NOLSE-NEXT: add w10, w10, w8 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: add w10, w12, w10 +; NOLSE-NEXT: lsr w10, w10, #16 +; NOLSE-NEXT: ubfx w11, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: fmov s4, w10 +; 
NOLSE-NEXT: add w9, w11, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w11, [x0] +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s1 +; LSE-NEXT: fmov s2, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fminnm s4, s4, s2 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fminnm s3, s3, s1 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: ubfx w12, w10, #16, #1 +; LSE-NEXT: add w10, w10, w8 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: add w10, w12, w10 +; LSE-NEXT: lsr w10, w10, #16 +; LSE-NEXT: ubfx w11, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: add w9, w11, w9 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: casal w11, w10, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w9 +; LSE-NEXT: b.ne 
.LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal 
x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fminnm v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] 
+; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov 
x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll new file mode 100644 index 00000000000000..f41ddcb81d5ca5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -0,0 +1,1209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc 
-mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half 
@test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { 
+; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 
// 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fsub s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fsub_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 
+; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fsub d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fsub_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl __subtf3 +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, 
#16] // 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl __subtf3 +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl __subtf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, 
[x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvtl v1.4s, v0.4h +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: fcvtl v2.4s, v0.4h +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub v2.4s, v2.4s, v1.4s +; NOLSE-NEXT: fcvtn v2.4h, v2.4s +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvtl v1.4s, v0.4h +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvtl v2.4s, v0.4h +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub v2.4s, v2.4s, v1.4s +; LSE-NEXT: fcvtn v2.4h, v2.4s +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, 
[x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: movi v1.4s, #1 +; NOLSE-NEXT: movi v2.4s, #127, msl #8 +; NOLSE-NEXT: shll v3.4s, v0.4h, #16 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: shll v4.4s, v0.4h, #16 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub v4.4s, v4.4s, v3.4s +; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 +; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b +; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s +; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; 
NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: movi v1.4s, #1 +; LSE-NEXT: movi v2.4s, #127, msl #8 +; LSE-NEXT: shll v3.4s, v0.4h, #16 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: shll v4.4s, v0.4h, #16 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: fsub v4.4s, v4.4s, v3.4s +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: ushr v5.4s, v4.4s, #16 +; LSE-NEXT: and v5.16b, v5.16b, v1.16b +; LSE-NEXT: add v4.4s, v5.4s, v4.4s +; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fsub v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fsub_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fsub v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; 
LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: 
Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } From 5048fabb0579f1417f69cde49221b5b9e9c15414 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 08:40:15 -0700 Subject: [PATCH 06/72] [Support] Delete FormatVariadicTest Validate sub-test (#106570) - The subtest, if enabled correctly, will fail with assert in Debug builds and validation is disabled in Release builds. - Hence deleting the test to fix test failures in CI. 
--- llvm/unittests/Support/FormatVariadicTest.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index 4c648d87fc2de7..6ee0d924867419 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -710,16 +710,6 @@ TEST(FormatVariadicTest, FormatFilterRange) { EXPECT_EQ("1, 2, 3", formatv("{0}", Range).str()); } -#ifdef NDEBUG // Disable the test in debug builds where it will assert. -TEST(FormatVariadicTest, Validate) { - std::string Str = formatv("{0}", 1, 2).str(); - EXPECT_THAT(Str, HasSubstr("Unexpected number of arguments")); - - Str = formatv("{0} {2}", 1, 2, 3).str(); - EXPECT_THAT(Str, HasSubstr("eplacement indices have holes")); -} -#endif // NDEBUG - namespace { enum class Base { First }; From 26c3a8404f1b3327a0982faeeaee94b08d1ee481 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 29 Aug 2024 19:48:51 +0400 Subject: [PATCH 07/72] AArch64: Use consistent atomicrmw expansion for FP operations (#103702) Use LLSC or cmpxchg in the same cases as for the unsupported integer operations. This required some fixups to the LLSC implementation to deal with the fp128 case. The comment about floating-point exceptions was wrong, because floating-point exceptions are not really exceptions at all. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 59 ++- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 330 ++++------------ llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 356 +++++------------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 356 +++++------------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 330 ++++------------ 5 files changed, 376 insertions(+), 1055 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 02390e0a85c0a5..a49dfdb28a41eb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27096,21 +27096,37 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { : AtomicExpansionKind::LLSC; } +// Return true if the atomic operation expansion will lower to use a library +// call, and is thus ineligible to use an LLSC expansion. +static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, + const AtomicRMWInst *RMW) { + if (!RMW->isFloatingPointOperation()) + return false; + switch (RMW->getType()->getScalarType()->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::HalfTyID: + case Type::BFloatTyID: + // Will use soft float + return !Subtarget.hasFPARMv8(); + default: + // fp128 will emit library calls. + return true; + } + + llvm_unreachable("covered type switch"); +} + // The "default" for integer RMW operations is to expand to an LL/SC loop. // However, with the LSE instructions (or outline-atomics mode, which provides // library routines in place of the LSE-instructions), we can directly emit many // operations instead. -// -// Floating-point operations are always emitted to a cmpxchg loop, because they -// may trigger a trap which aborts an LLSC sequence. 
TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { - unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + Type *Ty = AI->getType(); + unsigned Size = Ty->getPrimitiveSizeInBits(); assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes."); - if (AI->isFloatingPointOperation()) - return AtomicExpansionKind::CmpXChg; - bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && (AI->getOperation() == AtomicRMWInst::Xchg || AI->getOperation() == AtomicRMWInst::Or || @@ -27120,7 +27136,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // Nand is not supported in LSE. // Leave 128 bits to LLSC or CmpXChg. - if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { + if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 && + !AI->isFloatingPointOperation()) { if (Subtarget->hasLSE()) return AtomicExpansionKind::None; if (Subtarget->outlineAtomics()) { @@ -27146,7 +27163,7 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if // we have a single CAS instruction that can replace the loop. 
if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None || - Subtarget->hasLSE()) + Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI)) return AtomicExpansionKind::CmpXChg; return AtomicExpansionKind::LLSC; @@ -27193,10 +27210,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); + + auto *Int128Ty = Type::getInt128Ty(Builder.getContext()); + Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64"); + Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64"); + + Value *Or = Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64"); + return Builder.CreateBitCast(Or, ValueTy); } Type *Tys[] = { Addr->getType() }; @@ -27207,8 +27228,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); CallInst *CI = Builder.CreateCall(Ldxr, Addr); - CI->addParamAttr( - 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); + CI->addParamAttr(0, Attribute::get(Builder.getContext(), + Attribute::ElementType, IntEltTy)); Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); @@ -27234,9 +27255,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, IsRelease ? 
Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Type *Int128Ty = Type::getInt128Ty(M->getContext()); + + Value *CastVal = Builder.CreateBitCast(Val, Int128Ty); - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo"); + Value *Hi = + Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi"); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 89c9880ffc7868..0d230bb9dcc6e9 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -7,33 +7,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: 
stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -108,33 +92,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -211,19 +179,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr 
h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -232,21 +193,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -325,19 +274,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; 
NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -346,21 +288,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -436,31 +366,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: 
Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fadd s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -523,31 +437,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: 
.LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -748,35 +646,19 @@ define fp128 @test_atomicrmw_fadd_fp128_seq_cst_align16(ptr %ptr, fp128 %value) define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fadd v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: 
test_atomicrmw_fadd_v2f16_seq_cst_align4: @@ -867,38 +749,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fadd v3.4s, v3.4s, v0.4s +; NOLSE-NEXT: ushr v4.4s, v3.4s, #16 +; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b +; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s +; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret 
; ; LSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: @@ -984,31 +850,15 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1086,43 +936,17 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fadd_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: 
ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fadd v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 998d8ae0c1de4d..12a0c1169f2b6a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -10,33 +10,17 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // 
%atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -111,33 +95,17 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; 
NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -214,19 +182,12 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fmaxnm s2, s2, s1 @@ -235,21 +196,9 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; 
NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -328,19 +277,12 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fmaxnm s2, s2, s1 @@ -349,21 +291,9 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 
-; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -439,31 +369,15 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fmaxnm s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -526,31 +440,15 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // 
%atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -753,41 +651,25 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 ; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: b .LBB7_2 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fcvt s4, h0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s2, w8 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fcvt s2, h2 ; NOLSE-NEXT: fcvt 
s3, h3 -; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmaxnm s2, s2, s0 ; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fcvt h3, s3 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: mov v2.h[1], v3.h[0] +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: @@ -888,58 +770,42 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fmov w10, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr s0, [x0] ; NOLSE-NEXT: lsl w10, w10, #16 ; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: mov h3, v2.h[1] +; 
NOLSE-NEXT: fmov w11, s2 +; NOLSE-NEXT: lsl w11, w11, #16 +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s3 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmaxnm s4, s4, s2 -; NOLSE-NEXT: fmov s3, w9 ; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmaxnm s2, s2, s0 +; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: ubfx w13, w11, #16, #1 +; NOLSE-NEXT: add w11, w11, w8 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: add w11, w13, w11 +; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 ; NOLSE-NEXT: add w10, w10, w8 -; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: add w10, w12, w10 ; NOLSE-NEXT: lsr w10, w10, #16 -; NOLSE-NEXT: ubfx w11, w9, #16, #1 -; NOLSE-NEXT: add w9, w9, w8 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: add w9, w11, w9 -; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s3, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w10, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w11, [x0] -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: mov v3.h[1], v2.h[0] +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: stlxr w11, w10, [x0] +; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: @@ -1047,31 +913,15 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x 
float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1149,43 +999,17 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: 
Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 2697dbf5b2191d..71765f435d94cf 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -10,33 +10,17 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; 
NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -111,33 +95,17 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh 
w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -214,19 +182,12 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fminnm s2, s2, s1 @@ -235,21 +196,9 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: 
cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -328,19 +277,12 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fminnm s2, s2, s1 @@ -349,21 +291,9 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // 
kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -439,31 +369,15 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fminnm s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -526,31 +440,15 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // 
%atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -753,41 +651,25 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 ; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: b .LBB7_2 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fcvt s4, h0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s2, w8 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fcvt s2, h2 ; NOLSE-NEXT: fcvt s3, h3 -; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fminnm s2, s2, s0 ; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: 
fcvt h3, s3 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: mov v2.h[1], v3.h[0] +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: @@ -888,58 +770,42 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fmov w10, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr s0, [x0] ; NOLSE-NEXT: lsl w10, w10, #16 ; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fmov w11, s2 +; NOLSE-NEXT: lsl w11, w11, #16 +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s3 -; 
NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fminnm s4, s4, s2 -; NOLSE-NEXT: fmov s3, w9 ; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fminnm s2, s2, s0 +; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: ubfx w13, w11, #16, #1 +; NOLSE-NEXT: add w11, w11, w8 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: add w11, w13, w11 +; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 ; NOLSE-NEXT: add w10, w10, w8 -; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: add w10, w12, w10 ; NOLSE-NEXT: lsr w10, w10, #16 -; NOLSE-NEXT: ubfx w11, w9, #16, #1 -; NOLSE-NEXT: add w9, w9, w8 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: add w9, w11, w9 -; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s3, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w10, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w11, [x0] -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: mov v3.h[1], v2.h[0] +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: stlxr w11, w10, [x0] +; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: @@ -1047,31 +913,15 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: 
ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1149,43 +999,17 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: 
mov v1.d[1], x9 ; NOLSE-NEXT: fminnm v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index f41ddcb81d5ca5..67e164037d5ce7 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -7,33 +7,17 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; 
NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -108,33 +92,17 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh 
wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -211,19 +179,12 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fsub s2, s2, s1 @@ -232,21 +193,9 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // 
%atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -325,19 +274,12 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fsub s2, s2, s1 @@ -346,21 +288,9 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -436,31 +366,15 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float 
@test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fsub s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -523,31 +437,15 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr 
x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fsub d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -748,35 +646,19 @@ define fp128 @test_atomicrmw_fsub_fp128_seq_cst_align16(ptr %ptr, fp128 %value) define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fsub v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b 
.LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fsub v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: @@ -867,38 +749,22 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fsub v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fsub v3.4s, v3.4s, v0.4s +; NOLSE-NEXT: ushr v4.4s, v3.4s, #16 +; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b +; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s +; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: @@ -984,31 +850,15 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fsub v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: 
fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -1086,43 +936,17 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fsub_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fsub v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret 
; From b5a1b45fe321cdf57d1b6155ecbbc18b6f95502f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 08:54:58 -0700 Subject: [PATCH 08/72] [SLP] Early return in getReorderingData [nfc] --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fe57dbfc93d3e7..81811e0a4d9295 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5226,6 +5226,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { !TE.isAltShuffle()) return TE.ReorderIndices; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { + if (!TE.ReorderIndices.empty()) + return TE.ReorderIndices; + auto PHICompare = [&](unsigned I1, unsigned I2) { Value *V1 = TE.Scalars[I1]; Value *V2 = TE.Scalars[I2]; @@ -5259,8 +5262,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return false; return true; }; - if (!TE.ReorderIndices.empty()) - return TE.ReorderIndices; DenseMap PhiToId; SmallVector Phis(TE.Scalars.size()); std::iota(Phis.begin(), Phis.end(), 0); From a9ffb719bc323588b6b60fbf227db8104a81310e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 09:07:51 -0700 Subject: [PATCH 09/72] [RISCV] Don't promote f16 FNEG/FABS with Zfhmin/Zhinxmin. (#106474) fneg/fabs are not supposed to canonicalize nans. Promoting to f32 will go through an fp_extend which will canonicalize. The generic Promote handler needs to be removed from LegalizeDAG. We need to use integer bit manip to clear the bit instead. Unfortunately, this is going through the stack due to i16 not being a legal type. Fixing that will require custom legalization or some other generic SelectionDAG change. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- llvm/test/CodeGen/RISCV/bfloat-arith.ll | 603 +++++++++++---- llvm/test/CodeGen/RISCV/half-arith-strict.ll | 631 ++++++++++----- llvm/test/CodeGen/RISCV/half-arith.ll | 724 ++++++++++-------- .../RISCV/half-bitmanip-dagcombines.ll | 70 +- llvm/test/CodeGen/RISCV/half-intrinsics.ll | 27 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 189 ++++- 7 files changed, 1533 insertions(+), 720 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dbb9241fe8cad2..09928dcc1f489a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -395,7 +395,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, - ISD::FABS, ISD::FNEG, ISD::STRICT_FMA, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, @@ -416,8 +415,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::bf16, Expand); setOperationAction(ZfhminZfbfminPromoteOps, MVT::bf16, Promote); setOperationAction(ISD::FREM, MVT::bf16, Promote); - // FIXME: Need to promote bf16 FCOPYSIGN to f32, but the - // DAGCombiner::visitFP_ROUND probably needs improvements first. + setOperationAction(ISD::FABS, MVT::bf16, Expand); + setOperationAction(ISD::FNEG, MVT::bf16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); } @@ -433,8 +432,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::STRICT_LRINT, ISD::STRICT_LLRINT, ISD::STRICT_LROUND, ISD::STRICT_LLROUND}, MVT::f16, Legal); - // FIXME: Need to promote f16 FCOPYSIGN to f32, but the - // DAGCombiner::visitFP_ROUND probably needs improvements first. 
+ setOperationAction(ISD::FABS, MVT::f16, Expand); + setOperationAction(ISD::FNEG, MVT::f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); } diff --git a/llvm/test/CodeGen/RISCV/bfloat-arith.ll b/llvm/test/CodeGen/RISCV/bfloat-arith.ll index 632e933c595671..56a30dd0f6ffee 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-arith.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-arith.ll @@ -105,17 +105,39 @@ define bfloat @fsgnj_s(bfloat %a, bfloat %b) nounwind { } define i32 @fneg_s(bfloat %a, bfloat %b) nounwind { -; CHECK-LABEL: fneg_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fadd.s fa5, fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: feq.s a0, fa5, fa4 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fneg_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: feq.s a0, fa5, fa4 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fneg_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: feq.s a0, 
fa5, fa4 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = fadd bfloat %a, %a %2 = fneg bfloat %1 %3 = fcmp oeq bfloat %1, %2 @@ -131,9 +153,11 @@ define bfloat @fsgnjn_s(bfloat %a, bfloat %b) nounwind { ; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 -; RV32IZFBFMIN-NEXT: fneg.s fa5, fa5 -; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 4(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 5(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 5(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 4(sp) ; RV32IZFBFMIN-NEXT: fsh fa0, 8(sp) ; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) ; RV32IZFBFMIN-NEXT: lbu a0, 9(sp) @@ -148,24 +172,26 @@ define bfloat @fsgnjn_s(bfloat %a, bfloat %b) nounwind { ; ; RV64IZFBFMIN-LABEL: fsgnjn_s: ; RV64IZFBFMIN: # %bb.0: -; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: addi sp, sp, -32 ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 -; RV64IZFBFMIN-NEXT: fneg.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fsh fa0, 0(sp) ; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) -; RV64IZFBFMIN-NEXT: lbu a0, 1(sp) -; RV64IZFBFMIN-NEXT: lbu a1, 9(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fsh fa0, 16(sp) +; RV64IZFBFMIN-NEXT: fsh fa5, 24(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 17(sp) +; RV64IZFBFMIN-NEXT: lbu a1, 25(sp) ; RV64IZFBFMIN-NEXT: andi a0, a0, 127 ; RV64IZFBFMIN-NEXT: andi a1, a1, 128 ; RV64IZFBFMIN-NEXT: or a0, a0, a1 -; RV64IZFBFMIN-NEXT: sb a0, 1(sp) -; RV64IZFBFMIN-NEXT: flh fa0, 0(sp) -; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: sb a0, 17(sp) +; RV64IZFBFMIN-NEXT: flh 
fa0, 16(sp) +; RV64IZFBFMIN-NEXT: addi sp, sp, 32 ; RV64IZFBFMIN-NEXT: ret %1 = fadd bfloat %a, %b %2 = fneg bfloat %1 @@ -176,19 +202,43 @@ define bfloat @fsgnjn_s(bfloat %a, bfloat %b) nounwind { declare bfloat @llvm.fabs.bf16(bfloat) define bfloat @fabs_s(bfloat %a, bfloat %b) nounwind { -; CHECK-LABEL: fabs_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 -; CHECK-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fabs.s fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fabs_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: andi a0, a0, 127 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fabs_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: andi a0, a0, 127 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 +; 
RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = fadd bfloat %a, %b %2 = call bfloat @llvm.fabs.bf16(bfloat %1) %3 = fadd bfloat %2, %1 @@ -239,21 +289,45 @@ define bfloat @fmadd_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fmsub_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fmsub_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fmsub_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 
9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %c_ = fadd bfloat 0.0, %c ; avoid negation using xor %negc = fsub bfloat -0.0, %c_ %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %negc) @@ -261,27 +335,61 @@ define bfloat @fmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 -; CHECK-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fneg.s fa4, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 9(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) 
+; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 1(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 1(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 0(sp) +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %a_ = fadd bfloat 0.0, %a %c_ = fadd bfloat 0.0, %c %nega = fsub bfloat -0.0, %a_ @@ -291,27 +399,61 @@ define bfloat @fnmadd_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s_2: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 -; CHECK-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; 
CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fneg.s fa4, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s_2: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 9(sp) +; RV32IZFBFMIN-NEXT: flh fa4, 8(sp) +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s_2: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 1(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: 
sb a0, 1(sp) +; RV64IZFBFMIN-NEXT: flh fa4, 0(sp) +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa4 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %b_ = fadd bfloat 0.0, %b %c_ = fadd bfloat 0.0, %c %negb = fsub bfloat -0.0, %b_ @@ -321,17 +463,37 @@ define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s_3(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s_3: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s_3: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa0, 12(sp) +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s_3: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: 
fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa0, 8(sp) +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %neg = fneg bfloat %1 ret bfloat %neg @@ -339,38 +501,82 @@ define bfloat @fnmadd_s_3(bfloat %a, bfloat %b, bfloat %c) nounwind { define bfloat @fnmadd_nsz(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_nsz: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_nsz: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa0, 12(sp) +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_nsz: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa0, 8(sp) +; RV64IZFBFMIN-NEXT: addi 
sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %1 = call nsz bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %neg = fneg nsz bfloat %1 ret bfloat %neg } define bfloat @fnmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmsub_s: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmsub_s: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmsub_s: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; 
RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %a_ = fadd bfloat 0.0, %a %nega = fsub bfloat -0.0, %a_ %1 = call bfloat @llvm.fma.bf16(bfloat %nega, bfloat %b, bfloat %c) @@ -378,21 +584,45 @@ define bfloat @fnmsub_s(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmsub_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmsub_s_2: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmsub_s_2: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmsub_s_2: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; 
RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa2 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa0 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %b_ = fadd bfloat 0.0, %b %negb = fsub bfloat -0.0, %b_ %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %negb, bfloat %c) @@ -439,30 +669,63 @@ define bfloat @fmsub_s_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { } define bfloat @fnmadd_s_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { -; CHECK-LABEL: fnmadd_s_contract: -; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa3, fa3 -; CHECK-NEXT: fcvt.s.bf16 fa2, fa2 -; CHECK-NEXT: fadd.s fa4, fa2, fa4 -; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa3 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fmul.s fa5, fa5, fa3 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fneg.s fa5, fa5 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fsub.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: ret +; RV32IZFBFMIN-LABEL: fnmadd_s_contract: +; RV32IZFBFMIN: # %bb.0: +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV32IZFBFMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa3, fa3 +; 
RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa3 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFBFMIN-NEXT: lbu a0, 13(sp) +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV32IZFBFMIN-NEXT: xori a0, a0, 128 +; RV32IZFBFMIN-NEXT: sb a0, 13(sp) +; RV32IZFBFMIN-NEXT: flh fa3, 12(sp) +; RV32IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa3 +; RV32IZFBFMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; RV64IZFBFMIN-LABEL: fnmadd_s_contract: +; RV64IZFBFMIN: # %bb.0: +; RV64IZFBFMIN-NEXT: addi sp, sp, -16 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IZFBFMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa1 +; RV64IZFBFMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa3, fa3 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa3, fa3 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFBFMIN-NEXT: lbu a0, 9(sp) +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa2 +; RV64IZFBFMIN-NEXT: xori a0, a0, 128 +; RV64IZFBFMIN-NEXT: sb a0, 9(sp) +; RV64IZFBFMIN-NEXT: flh fa3, 8(sp) +; RV64IZFBFMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa5 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa3 +; RV64IZFBFMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5 +; RV64IZFBFMIN-NEXT: addi sp, sp, 16 +; RV64IZFBFMIN-NEXT: ret %a_ = fadd bfloat 0.0, %a ; avoid negation using xor %b_ = fadd bfloat 0.0, %b ; avoid negation using xor %c_ = fadd bfloat 0.0, %c ; avoid negation 
using xor diff --git a/llvm/test/CodeGen/RISCV/half-arith-strict.ll b/llvm/test/CodeGen/RISCV/half-arith-strict.ll index 02cd91c7075940..4c7096f4045e2b 100644 --- a/llvm/test/CodeGen/RISCV/half-arith-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-arith-strict.ll @@ -11,16 +11,16 @@ ; RUN: | FileCheck -check-prefix=CHECK-ZHINX %s ; RUN: llc -mtriple=riscv32 -mattr=+zfhmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi ilp32f < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZFHMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZFHMIN,CHECK-ZFHMIN-RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+zfhmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi lp64f < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZFHMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZFHMIN,CHECK-ZFHMIN-RV64 %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi ilp32 < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZHINXMIN,CHECK-ZHINXMIN-RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -disable-strictnode-mutation -target-abi lp64 < %s \ -; RUN: | FileCheck -check-prefix=CHECK-ZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECK-ZHINXMIN,CHECK-ZHINXMIN-RV64 %s ; FIXME: We can't test without Zfh because soft promote legalization isn't ; implemented in SelectionDAG for STRICT nodes. 
@@ -239,36 +239,83 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fmsub.h a0, a0, a1, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fmsub_h: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fmsub_h: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fmsub_h: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-RV32-NEXT: 
fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fmsub_h: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fmsub_h: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: sh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fmsub_h: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: sh a2, 8(sp) 
+; CHECK-ZHINXMIN-RV64-NEXT: lbu a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %c_ = fadd half 0.0, %c ; avoid negation using xor %negc = fneg half %c_ %1 = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %negc, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp @@ -291,48 +338,115 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmadd.h a0, a0, a1, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmadd_h: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECK-ZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmadd_h: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; 
CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmadd_h: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa4, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmadd_h: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: 
lbu a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa4, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmadd_h: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: sh a0, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a0, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a0, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a0, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: sh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmadd_h: +; 
CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: sh a0, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a0, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a0, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a0, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: sh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %a_ = fadd half 0.0, %a %c_ = fadd half 0.0, %c %nega = fneg half %a_ @@ -357,48 +471,115 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmadd.h a0, a1, a0, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmadd_h_2: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECK-ZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; 
CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmadd_h_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmadd_h_2: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa4, 8(sp) +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; 
CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmadd_h_2: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 1(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa4, 0(sp) +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmadd_h_2: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: sh a1, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a1, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a1, 9(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a1, 8(sp) +; CHECK-ZHINXMIN-RV32-NEXT: sh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a2, 13(sp) +; 
CHECK-ZHINXMIN-RV32-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a2, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a2, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmadd_h_2: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: sh a1, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a1, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a1, 1(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a1, 0(sp) +; CHECK-ZHINXMIN-RV64-NEXT: sh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a2, a2, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a2, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a2, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %b_ = fadd half 0.0, %b %c_ = fadd half 0.0, %c %negb = fneg half %b_ @@ -421,36 +602,83 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmsub.h a0, a0, a1, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmsub_h: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; 
CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmsub_h: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmsub_h: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmsub_h: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, 
fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmsub_h: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: sh a0, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a0, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a0, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmsub_h: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a0, a0, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: sh a0, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a0, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: 
fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %a_ = fadd half 0.0, %a %nega = fneg half %a_ %1 = call half @llvm.experimental.constrained.fma.f16(half %nega, half %b, half %c, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp @@ -471,36 +699,83 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINX-NEXT: fnmsub.h a0, a1, a0, a2 ; CHECK-ZHINX-NEXT: ret ; -; CHECK-ZFHMIN-LABEL: fnmsub_h_2: -; CHECK-ZFHMIN: # %bb.0: -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECK-ZFHMIN-NEXT: ret -; -; CHECK-ZHINXMIN-LABEL: fnmsub_h_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECK-ZFHMIN-RV32-LABEL: fnmsub_h_2: +; CHECK-ZFHMIN-RV32: # %bb.0: +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV32-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV32-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fsh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: lbu 
a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV32-NEXT: sb a0, 13(sp) +; CHECK-ZFHMIN-RV32-NEXT: flh fa5, 12(sp) +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV32-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-ZFHMIN-RV32-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV32-NEXT: ret +; +; CHECK-ZFHMIN-RV64-LABEL: fnmsub_h_2: +; CHECK-ZFHMIN-RV64: # %bb.0: +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa1 +; CHECK-ZFHMIN-RV64-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-RV64-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fsh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: lbu a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: xori a0, a0, 128 +; CHECK-ZFHMIN-RV64-NEXT: sb a0, 9(sp) +; CHECK-ZFHMIN-RV64-NEXT: flh fa5, 8(sp) +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa4, fa2 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa3, fa0 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.s.h fa5, fa5 +; CHECK-ZFHMIN-RV64-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-ZFHMIN-RV64-NEXT: fcvt.h.s fa0, fa5 +; CHECK-ZFHMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZFHMIN-RV64-NEXT: ret +; +; CHECK-ZHINXMIN-RV32-LABEL: fnmsub_h_2: +; CHECK-ZHINXMIN-RV32: # %bb.0: +; CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: sh a1, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lbu a1, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV32-NEXT: sb a1, 13(sp) +; CHECK-ZHINXMIN-RV32-NEXT: lh a1, 12(sp) +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV32-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV32-NEXT: fcvt.h.s a0, a0 +; 
CHECK-ZHINXMIN-RV32-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV32-NEXT: ret +; +; CHECK-ZHINXMIN-RV64-LABEL: fnmsub_h_2: +; CHECK-ZHINXMIN-RV64: # %bb.0: +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, -16 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fadd.s a1, a1, zero +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: sh a1, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lbu a1, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: xori a1, a1, 128 +; CHECK-ZHINXMIN-RV64-NEXT: sb a1, 9(sp) +; CHECK-ZHINXMIN-RV64-NEXT: lh a1, 8(sp) +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-RV64-NEXT: fmadd.s a0, a0, a1, a2 +; CHECK-ZHINXMIN-RV64-NEXT: fcvt.h.s a0, a0 +; CHECK-ZHINXMIN-RV64-NEXT: addi sp, sp, 16 +; CHECK-ZHINXMIN-RV64-NEXT: ret %b_ = fadd half 0.0, %b %negb = fneg half %b_ %1 = call half @llvm.experimental.constrained.fma.f16(half %a, half %negb, half %c, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 10e63e3a9f7483..59981a282ab43e 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -630,29 +630,39 @@ define i32 @fneg_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fneg_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa4 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fneg_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, 
fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: feq.s a0, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fneg_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: feq.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fneg_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: feq.s a0, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fneg_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -771,9 +781,11 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 ; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 ; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fneg.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 4(sp) +; 
CHECK-RV32-FSGNJ-NEXT: lbu a0, 5(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 5(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 4(sp) ; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 8(sp) ; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) ; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) @@ -788,24 +800,26 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; ; CHECK-RV64-FSGNJ-LABEL: fsgnjn_s: ; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -32 ; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 ; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 ; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 ; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fneg.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 0(sp) ; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 16(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 24(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 17(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a1, 25(sp) ; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 127 ; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 128 ; CHECK-RV64-FSGNJ-NEXT: or a0, a0, a1 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 17(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa0, 16(sp) +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 32 ; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fsgnjn_s: ; CHECK-ZHINXMIN: # %bb.0: @@ -971,33 +985,43 @@ define half @fabs_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fabs_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECKIZFHMIN-NEXT: 
fcvt.s.h fa4, fa0 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fabs.s fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fabs_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 127 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fabs_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fabs.s a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a0, a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fabs_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: andi a0, 
a0, 127 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fabs_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 @@ -1409,36 +1433,45 @@ define half @fmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fmsub_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fmsub_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmsub_s: -; CHECKZHINXMIN: # 
%bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fneg.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fmsub_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fmsub_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 @@ -1591,48 +1624,61 @@ define half @fnmadd_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, 
fa4 -; CHECKIZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmadd_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fneg.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; 
CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmadd_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -1793,48 +1839,61 @@ define half @fnmadd_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s_2: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 -; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fneg.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_2: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmadd_s_2: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fneg.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; 
CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fneg.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_2: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmadd_s_2: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 @@ -1959,17 +2018,37 @@ define half @fnmadd_s_3(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s_3: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_3: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret +; +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_3: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; ; CHECKZHINXMIN-LABEL: fnmadd_s_3: ; CHECKZHINXMIN: # %bb.0: @@ -2090,17 +2169,37 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_nsz: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; 
CHECK-RV32-FSGNJ-LABEL: fnmadd_nsz: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret +; +; CHECK-RV64-FSGNJ-LABEL: fnmadd_nsz: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; ; CHECKZHINXMIN-LABEL: fnmadd_nsz: ; CHECKZHINXMIN: # %bb.0: @@ -2227,36 +2326,45 @@ define half @fnmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmsub_s: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; 
CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmsub_s: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmsub_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmsub_s: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: 
fmadd.s fa5, fa5, fa3, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmsub_s: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -2379,36 +2487,45 @@ define half @fnmsub_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmsub_s_2: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmsub_s_2: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmsub_s_2: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; 
CHECKZHINXMIN-NEXT: fneg.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmsub_s_2: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmsub_s_2: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 @@ -2847,54 +2964,63 @@ define half @fnmadd_s_contract(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fnmadd_s_contract: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fadd.s fa3, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa3, fa3 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa2, fa2 -; CHECKIZFHMIN-NEXT: fadd.s fa4, fa2, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa3 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fmul.s fa5, fa5, fa3 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fneg.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fsub.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_contract: +; CHECK-RV32-FSGNJ: # %bb.0: +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa3, fa3 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa3 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) +; CHECK-RV32-FSGNJ-NEXT: flh fa3, 12(sp) +; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa3 +; CHECK-RV32-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 +; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV32-FSGNJ-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmadd_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; 
CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fneg.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret +; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_contract: +; CHECK-RV64-FSGNJ: # %bb.0: +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 +; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa3, fa3 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa3 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 +; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 +; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) +; CHECK-RV64-FSGNJ-NEXT: flh fa3, 8(sp) +; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa3 +; CHECK-RV64-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 +; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 +; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 +; CHECK-RV64-FSGNJ-NEXT: ret ; CHECK-ZHINXMIN-LABEL: fnmadd_s_contract: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 diff --git a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll index a103a9e09d1498..c824e7f9845951 100644 --- a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll +++ 
b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll @@ -208,13 +208,15 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV32IZFHMIN-LABEL: fcopysign_fneg: ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: addi sp, sp, -16 -; RV32IZFHMIN-NEXT: fmv.h.x fa5, a0 -; RV32IZFHMIN-NEXT: fmv.h.x fa4, a1 -; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; RV32IZFHMIN-NEXT: fneg.s fa4, fa4 -; RV32IZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) -; RV32IZFHMIN-NEXT: fsh fa4, 12(sp) +; RV32IZFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32IZFHMIN-NEXT: fsh fa5, 4(sp) +; RV32IZFHMIN-NEXT: lbu a1, 5(sp) +; RV32IZFHMIN-NEXT: xori a1, a1, 128 +; RV32IZFHMIN-NEXT: sb a1, 5(sp) +; RV32IZFHMIN-NEXT: flh fa5, 4(sp) +; RV32IZFHMIN-NEXT: fmv.h.x fa4, a0 +; RV32IZFHMIN-NEXT: fsh fa4, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) ; RV32IZFHMIN-NEXT: lbu a0, 9(sp) ; RV32IZFHMIN-NEXT: lbu a1, 13(sp) ; RV32IZFHMIN-NEXT: andi a0, a0, 127 @@ -228,31 +230,35 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; ; RV64IZFHMIN-LABEL: fcopysign_fneg: ; RV64IZFHMIN: # %bb.0: -; RV64IZFHMIN-NEXT: addi sp, sp, -16 -; RV64IZFHMIN-NEXT: fmv.h.x fa5, a0 -; RV64IZFHMIN-NEXT: fmv.h.x fa4, a1 -; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; RV64IZFHMIN-NEXT: fneg.s fa4, fa4 -; RV64IZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) -; RV64IZFHMIN-NEXT: fsh fa4, 8(sp) -; RV64IZFHMIN-NEXT: lbu a0, 1(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, -32 +; RV64IZFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) ; RV64IZFHMIN-NEXT: lbu a1, 9(sp) +; RV64IZFHMIN-NEXT: xori a1, a1, 128 +; RV64IZFHMIN-NEXT: sb a1, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fmv.h.x fa4, a0 +; RV64IZFHMIN-NEXT: fsh fa4, 16(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 24(sp) +; RV64IZFHMIN-NEXT: lbu a0, 17(sp) +; RV64IZFHMIN-NEXT: lbu a1, 25(sp) ; RV64IZFHMIN-NEXT: andi a0, a0, 127 ; RV64IZFHMIN-NEXT: andi a1, a1, 128 ; RV64IZFHMIN-NEXT: or a0, a0, a1 -; RV64IZFHMIN-NEXT: sb a0, 1(sp) -; 
RV64IZFHMIN-NEXT: flh fa5, 0(sp) +; RV64IZFHMIN-NEXT: sb a0, 17(sp) +; RV64IZFHMIN-NEXT: flh fa5, 16(sp) ; RV64IZFHMIN-NEXT: fmv.x.h a0, fa5 -; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: addi sp, sp, 32 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: fcopysign_fneg: ; RV32IZHINXMIN: # %bb.0: ; RV32IZHINXMIN-NEXT: addi sp, sp, -16 -; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV32IZHINXMIN-NEXT: fneg.s a1, a1 -; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 4(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 5(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 5(sp) +; RV32IZHINXMIN-NEXT: lh a1, 4(sp) ; RV32IZHINXMIN-NEXT: sh a0, 8(sp) ; RV32IZHINXMIN-NEXT: sh a1, 12(sp) ; RV32IZHINXMIN-NEXT: lbu a0, 9(sp) @@ -267,20 +273,22 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; ; RV64IZHINXMIN-LABEL: fcopysign_fneg: ; RV64IZHINXMIN: # %bb.0: -; RV64IZHINXMIN-NEXT: addi sp, sp, -16 -; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV64IZHINXMIN-NEXT: fneg.s a1, a1 -; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 -; RV64IZHINXMIN-NEXT: sh a0, 0(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, -32 ; RV64IZHINXMIN-NEXT: sh a1, 8(sp) -; RV64IZHINXMIN-NEXT: lbu a0, 1(sp) ; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: sh a0, 16(sp) +; RV64IZHINXMIN-NEXT: sh a1, 24(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 17(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 25(sp) ; RV64IZHINXMIN-NEXT: andi a0, a0, 127 ; RV64IZHINXMIN-NEXT: andi a1, a1, 128 ; RV64IZHINXMIN-NEXT: or a0, a0, a1 -; RV64IZHINXMIN-NEXT: sb a0, 1(sp) -; RV64IZHINXMIN-NEXT: lh a0, 0(sp) -; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: sb a0, 17(sp) +; RV64IZHINXMIN-NEXT: lh a0, 16(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, 32 ; RV64IZHINXMIN-NEXT: ret %1 = fneg half %b %2 = call half @llvm.copysign.f16(half %a, half %1) diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll 
b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index 4587c442cda5b3..7f1eebdf64a551 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -1821,12 +1821,27 @@ define half @fabs_f16(half %a) nounwind { ; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fabs_f16: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-NEXT: fabs.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fabs_f16: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fsh fa0, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 127 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa0, 12(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fabs_f16: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 127 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa0, 8(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: fabs_f16: ; RV32IZHINXMIN: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index cdbca0b874e607..fb9c0a57fd1bee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -650,38 +650,165 @@ define void @fabs_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fabs_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: 
vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: fabs_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vfabs.v v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: fabs_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: fabs_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, 
ma +; ZVFHMIN-ZFH-RV64-NEXT: vfabs.v v8, v9 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: fabs_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -64 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 64 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 36(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 37(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 37(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 33(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 33(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 29(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 29(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 25(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 25(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 21(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 21(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lbu a1, 17(sp) 
+; ZVFHMIN-ZFHIN-RV32-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV32-NEXT: sb a1, 17(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 36(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 58(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 56(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 54(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 52(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 50(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa2, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 48(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 48 +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 46(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 44(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 42(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 40(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 40 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v8, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 64 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: fabs_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -80 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 56(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 48(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 40(sp) +; 
ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 57(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 57(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 49(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 49(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 41(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 41(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 33(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 33(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 25(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 25(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lbu a1, 17(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: andi a1, a1, 127 +; ZVFHMIN-ZFHIN-RV64-NEXT: sb a1, 17(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 56(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 74(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 48(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 72(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 40(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 70(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 32(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 68(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 66(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 64(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 64 +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v8, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 80 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x 
half>, ptr %x %b = call <6 x half> @llvm.fabs.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x From e05c22484efb5c767115525adfa4273e48b1ae26 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 29 Aug 2024 20:12:14 +0400 Subject: [PATCH 10/72] AArch64: Delete tests of fp128 atomicrmw fmin/fmax These are getting different output on some build hosts for some reason. The stack offsets of temporaries are different. --- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 285 +++++--------------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 285 +++++--------------- 2 files changed, 146 insertions(+), 424 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 12a0c1169f2b6a..bfe0d20ca814bc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -3,8 +3,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s -; FIXME: Windows hosts assigns stack slots to different offsets for some reason. 
-; UNSUPPORTED: system-windows +; FIXME: Restore test of fp128 case define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: @@ -508,144 +507,6 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ret double %res } -define fp128 @test_atomicrmw_fmax_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { -; NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: -; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 -; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; NOLSE-NEXT: bl fmaxl -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; 
NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: -; LSE: // %bb.0: -; LSE-NEXT: sub sp, sp, #96 -; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; LSE-NEXT: mov x19, x0 -; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; LSE-NEXT: .LBB6_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; LSE-NEXT: bl fmaxl -; LSE-NEXT: str q0, [sp, #48] -; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; LSE-NEXT: ldp x0, x1, [sp, #48] -; LSE-NEXT: str q0, [sp, #64] -; LSE-NEXT: ldp x2, x3, [sp, #64] -; LSE-NEXT: mov x4, x2 -; LSE-NEXT: mov x5, x3 -; LSE-NEXT: caspal x4, x5, x0, x1, [x19] -; LSE-NEXT: stp x4, x5, [sp, #32] -; LSE-NEXT: cmp x5, x3 -; LSE-NEXT: ldr q1, [sp, #32] -; LSE-NEXT: ccmp x4, x2, #0, eq -; LSE-NEXT: b.ne .LBB6_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: add sp, sp, #96 -; LSE-NEXT: ret -; -; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: -; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: mov x19, x3 -; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] -; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x21, x2 -; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 -; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x2, x21 -; SOFTFP-NOLSE-NEXT: mov x3, x19 -; SOFTFP-NOLSE-NEXT: mov x22, x1 -; SOFTFP-NOLSE-NEXT: mov x23, x0 -; SOFTFP-NOLSE-NEXT: bl fmaxl -; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: mov x9, x1 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cmp x0, x23 -; SOFTFP-NOLSE-NEXT: cset w10, ne -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne -; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp 
x30, x23, [sp], #48 // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ret - %res = atomicrmw fmax ptr %ptr, fp128 %value seq_cst, align 16 - ret fp128 %res -} - define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: @@ -653,7 +514,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w8, [x0] ; NOLSE-NEXT: fmov s2, w8 @@ -667,7 +528,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov v2.h[1], v3.h[0] ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: stlxr w10, w9, [x0] -; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: cbnz w10, .LBB6_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret @@ -679,7 +540,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: fcvt s2, h0 ; LSE-NEXT: ldr s0, [x0] ; LSE-NEXT: fcvt s1, h1 -; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: .LBB6_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fcvt s4, h0 @@ -695,7 +556,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: casal w10, w9, [x0] ; LSE-NEXT: fmov s0, w10 ; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: b.ne .LBB6_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -711,16 +572,16 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; 
SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 ; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -740,18 +601,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] ; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, 
#48] // 16-byte Folded Reload @@ -775,7 +636,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 @@ -803,7 +664,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov v3.h[1], v2.h[0] ; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: stlxr w11, w10, [x0] -; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: cbnz w11, .LBB7_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret @@ -820,7 +681,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: fmov s2, w10 ; LSE-NEXT: lsl w9, w9, #16 ; LSE-NEXT: fmov s1, w9 -; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fmov w10, s0 @@ -850,7 +711,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: casal w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 ; LSE-NEXT: cmp w11, w9 -; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: b.ne .LBB7_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -867,15 +728,15 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 ; 
SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -888,18 +749,18 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -913,14 +774,14 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: 
test_atomicrmw_fmax_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr x8, [x0] ; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s ; NOLSE-NEXT: fmov x8, d2 ; NOLSE-NEXT: stlxr w9, x8, [x0] -; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: cbnz w9, .LBB8_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret @@ -928,7 +789,7 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: .LBB8_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s ; LSE-NEXT: fmov x8, d1 @@ -937,7 +798,7 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-NEXT: casal x10, x9, [x0] ; LSE-NEXT: fmov d1, x10 ; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: b.ne .LBB8_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: fmov d0, d1 ; LSE-NEXT: ret @@ -952,16 +813,16 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 ; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 -; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; 
SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf @@ -973,18 +834,18 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: mov w9, w0 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] ; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -999,7 +860,7 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxp x8, x9, [x0] ; NOLSE-NEXT: fmov d1, x8 @@ -1008,7 +869,7 
@@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; NOLSE-NEXT: mov x8, v2.d[1] ; NOLSE-NEXT: fmov x9, d2 ; NOLSE-NEXT: stlxp w10, x9, x8, [x0] -; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: cbnz w10, .LBB9_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret @@ -1016,7 +877,7 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d ; LSE-NEXT: mov x3, v1.d[1] @@ -1030,7 +891,7 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-NEXT: cmp x7, x3 ; LSE-NEXT: ccmp x6, x2, #0, eq ; LSE-NEXT: mov v1.d[1], x7 -; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: b.ne .LBB9_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: mov v0.16b, v1.16b ; LSE-NEXT: ret @@ -1045,15 +906,15 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov x21, x2 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB10_2 -; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 -; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_6 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child 
Loop BB9_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x22, x1 ; SOFTFP-NOLSE-NEXT: mov x23, x0 ; SOFTFP-NOLSE-NEXT: mov x0, x1 @@ -1064,26 +925,26 @@ define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] ; SOFTFP-NOLSE-NEXT: cmp x0, x23 ; SOFTFP-NOLSE-NEXT: cset w9, ne ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB9_5 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll 
b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 71765f435d94cf..6b7d2df044460a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -3,8 +3,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s -; FIXME: Windows hosts assigns stack slots to different offsets for some reason. -; UNSUPPORTED: system-windows +; FIXME: Restore test of fp128 case define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: @@ -508,144 +507,6 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ret double %res } -define fp128 @test_atomicrmw_fmin_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { -; NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: -; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 -; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; NOLSE-NEXT: bl fminl -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] 
-; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: -; LSE: // %bb.0: -; LSE-NEXT: sub sp, sp, #96 -; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; LSE-NEXT: mov x19, x0 -; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; LSE-NEXT: .LBB6_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; LSE-NEXT: bl fminl -; LSE-NEXT: str q0, [sp, #48] -; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; LSE-NEXT: ldp x0, x1, [sp, #48] -; LSE-NEXT: str q0, [sp, #64] -; LSE-NEXT: ldp x2, x3, [sp, #64] -; LSE-NEXT: mov x4, x2 -; LSE-NEXT: mov x5, x3 -; LSE-NEXT: caspal x4, x5, x0, x1, [x19] -; LSE-NEXT: stp x4, x5, [sp, #32] -; LSE-NEXT: cmp x5, x3 -; LSE-NEXT: ldr q1, [sp, #32] -; LSE-NEXT: ccmp x4, x2, #0, eq -; LSE-NEXT: b.ne .LBB6_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; LSE-NEXT: mov v0.16b, v1.16b -; LSE-NEXT: add 
sp, sp, #96 -; LSE-NEXT: ret -; -; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: -; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: mov x19, x3 -; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] -; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov x21, x2 -; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 -; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x2, x21 -; SOFTFP-NOLSE-NEXT: mov x3, x19 -; SOFTFP-NOLSE-NEXT: mov x22, x1 -; SOFTFP-NOLSE-NEXT: mov x23, x0 -; SOFTFP-NOLSE-NEXT: bl fminl -; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: mov x9, x1 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cmp x0, x23 -; SOFTFP-NOLSE-NEXT: cset w10, ne -; SOFTFP-NOLSE-NEXT: cmp x1, x22 -; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne -; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // 
%atomicrmw.end -; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ret - %res = atomicrmw fmin ptr %ptr, fp128 %value seq_cst, align 16 - ret fp128 %res -} - define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: @@ -653,7 +514,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w8, [x0] ; NOLSE-NEXT: fmov s2, w8 @@ -667,7 +528,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE-NEXT: mov v2.h[1], v3.h[0] ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: stlxr w10, w9, [x0] -; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: cbnz w10, .LBB6_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret @@ -679,7 +540,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: fcvt s2, h0 ; LSE-NEXT: ldr s0, [x0] ; LSE-NEXT: fcvt s1, h1 -; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: .LBB6_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fcvt s4, h0 @@ -695,7 +556,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; LSE-NEXT: casal w10, w9, [x0] ; LSE-NEXT: fmov s0, w10 ; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: b.ne .LBB6_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -711,16 +572,16 @@ define <2 x half> 
@test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 ; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -740,18 +601,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] ; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz 
wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -775,7 +636,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 @@ -803,7 +664,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov v3.h[1], v2.h[0] ; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: stlxr w11, w10, [x0] -; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: cbnz w11, .LBB7_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret @@ -820,7 +681,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: fmov s2, w10 ; LSE-NEXT: lsl w9, w9, #16 ; LSE-NEXT: fmov s1, w9 -; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: fmov w10, s0 @@ -850,7 +711,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-NEXT: casal w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 ; LSE-NEXT: cmp w11, w9 -; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: b.ne .LBB7_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; LSE-NEXT: ret @@ -867,15 +728,15 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -888,18 +749,18 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] ; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -913,14 +774,14 @@ define <2 x bfloat> 
@test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr x8, [x0] ; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s ; NOLSE-NEXT: fmov x8, d2 ; NOLSE-NEXT: stlxr w9, x8, [x0] -; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: cbnz w9, .LBB8_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret @@ -928,7 +789,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: .LBB8_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fminnm v2.2s, v1.2s, v0.2s ; LSE-NEXT: fmov x8, d1 @@ -937,7 +798,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; LSE-NEXT: casal x10, x9, [x0] ; LSE-NEXT: fmov d1, x10 ; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: b.ne .LBB8_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: fmov d0, d1 ; LSE-NEXT: ret @@ -952,16 +813,16 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 ; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: // 
kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 -; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf @@ -973,18 +834,18 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: mov w9, w0 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] ; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -999,7 +860,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start 
+; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxp x8, x9, [x0] ; NOLSE-NEXT: fmov d1, x8 @@ -1008,7 +869,7 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; NOLSE-NEXT: mov x8, v2.d[1] ; NOLSE-NEXT: fmov x9, d2 ; NOLSE-NEXT: stlxp w10, x9, x8, [x0] -; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: cbnz w10, .LBB9_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret @@ -1016,7 +877,7 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; LSE: // %bb.0: ; LSE-NEXT: ldr q1, [x0] -; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 ; LSE-NEXT: fminnm v2.2d, v1.2d, v0.2d ; LSE-NEXT: mov x3, v1.d[1] @@ -1030,7 +891,7 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; LSE-NEXT: cmp x7, x3 ; LSE-NEXT: ccmp x6, x2, #0, eq ; LSE-NEXT: mov v1.d[1], x7 -; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: b.ne .LBB9_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end ; LSE-NEXT: mov v0.16b, v1.16b ; LSE-NEXT: ret @@ -1045,15 +906,15 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov x21, x2 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB10_2 -; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq -; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 -; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_6 +; 
SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x22, x1 ; SOFTFP-NOLSE-NEXT: mov x23, x0 ; SOFTFP-NOLSE-NEXT: mov x0, x1 @@ -1064,26 +925,26 @@ define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x dou ; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] ; SOFTFP-NOLSE-NEXT: cmp x0, x23 ; SOFTFP-NOLSE-NEXT: cset w9, ne ; SOFTFP-NOLSE-NEXT: cmp x1, x22 ; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB9_5 ; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 -; SOFTFP-NOLSE-NEXT: b .LBB10_1 -; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte 
Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From c08c6a71cfc536e22fb7ad733fb8181a9e84e62a Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Thu, 29 Aug 2024 09:20:59 -0700 Subject: [PATCH 11/72] [mlir][scf] Allow unrolling loops with integer-typed IV. (#106164) SCF loops now can operate on integer-typed IV, thus I'm changing the loop unroller correspondingly. --- mlir/lib/Dialect/SCF/Utils/Utils.cpp | 35 +++++++++++++--------- mlir/test/Dialect/SCF/loop-unroll.mlir | 41 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 9545610f10be7c..a794a121d6267b 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -270,11 +270,13 @@ bool mlir::getInnermostParallelLoops(Operation *rootOp, static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor) { assert(divisor > 0 && "expected positive divisor"); - assert(dividend.getType().isIndex() && "expected index-typed value"); + assert(dividend.getType().isIntOrIndex() && + "expected integer or index-typed value"); - Value divisorMinusOneCst = - builder.create(loc, divisor - 1); - Value divisorCst = builder.create(loc, divisor); + Value divisorMinusOneCst = builder.create( + loc, builder.getIntegerAttr(dividend.getType(), divisor - 1)); + Value divisorCst = builder.create( + loc, builder.getIntegerAttr(dividend.getType(), divisor)); Value sum = builder.create(loc, dividend, divisorMinusOneCst); return builder.create(loc, sum, divisorCst); } @@ -285,9 +287,10 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, // where divis is rounding-to-zero division. 
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, Value divisor) { - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value cstOne = builder.create(loc, 1); + assert(dividend.getType().isIntOrIndex() && + "expected integer or index-typed value"); + Value cstOne = builder.create( + loc, builder.getOneAttr(dividend.getType())); Value divisorMinusOne = builder.create(loc, divisor, cstOne); Value sum = builder.create(loc, dividend, divisorMinusOne); return builder.create(loc, sum, divisor); @@ -409,16 +412,18 @@ LogicalResult mlir::loopUnrollByFactor( // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. generateEpilogueLoop = upperBoundUnrolledCst < ubCst; if (generateEpilogueLoop) - upperBoundUnrolled = boundsBuilder.create( - loc, upperBoundUnrolledCst); + upperBoundUnrolled = boundsBuilder.create( + loc, boundsBuilder.getIntegerAttr(forOp.getUpperBound().getType(), + upperBoundUnrolledCst)); else upperBoundUnrolled = forOp.getUpperBound(); // Create constant for 'stepUnrolled'. stepUnrolled = stepCst == stepUnrolledCst ? step - : boundsBuilder.create( - loc, stepUnrolledCst); + : boundsBuilder.create( + loc, boundsBuilder.getIntegerAttr( + step.getType(), stepUnrolledCst)); } else { // Dynamic loop bounds computation. 
// TODO: Add dynamic asserts for negative lb/ub/step, or @@ -428,8 +433,8 @@ LogicalResult mlir::loopUnrollByFactor( Value diff = boundsBuilder.create(loc, upperBound, lowerBound); Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); - Value unrollFactorCst = - boundsBuilder.create(loc, unrollFactor); + Value unrollFactorCst = boundsBuilder.create( + loc, boundsBuilder.getIntegerAttr(tripCount.getType(), unrollFactor)); Value tripCountRem = boundsBuilder.create(loc, tripCount, unrollFactorCst); // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) @@ -476,7 +481,9 @@ LogicalResult mlir::loopUnrollByFactor( [&](unsigned i, Value iv, OpBuilder b) { // iv' = iv + step * i; auto stride = b.create( - loc, step, b.create(loc, i)); + loc, step, + b.create(loc, + b.getIntegerAttr(iv.getType(), i))); return b.create(loc, iv, stride); }, annotateFn, iterArgs, yieldedValues); diff --git a/mlir/test/Dialect/SCF/loop-unroll.mlir b/mlir/test/Dialect/SCF/loop-unroll.mlir index e28efbb6ec2b91..68a11fb6a72c64 100644 --- a/mlir/test/Dialect/SCF/loop-unroll.mlir +++ b/mlir/test/Dialect/SCF/loop-unroll.mlir @@ -448,3 +448,44 @@ func.func @loop_unroll_yield_iter_arg() { // CHECK-NEXT: affine.yield %[[ITER_ARG]] : index // CHECK-NEXT: } // CHECK-NEXT: return + +// ----- + +// Test the loop unroller works with integer IV type. 
+func.func @static_loop_unroll_with_integer_iv() -> (f32, f32) { + %0 = arith.constant 7.0 : f32 + %lb = arith.constant 0 : i32 + %ub = arith.constant 20 : i32 + %step = arith.constant 1 : i32 + %result:2 = scf.for %i0 = %lb to %ub step %step iter_args(%arg0 = %0, %arg1 = %0) -> (f32, f32) : i32{ + %add = arith.addf %arg0, %arg1 : f32 + %mul = arith.mulf %arg0, %arg1 : f32 + scf.yield %add, %mul : f32, f32 + } + return %result#0, %result#1 : f32, f32 +} +// UNROLL-BY-3-LABEL: func @static_loop_unroll_with_integer_iv +// +// UNROLL-BY-3-DAG: %[[CST:.*]] = arith.constant {{.*}} : f32 +// UNROLL-BY-3-DAG: %[[C0:.*]] = arith.constant 0 : i32 +// UNROLL-BY-3-DAG: %[[C1:.*]] = arith.constant 1 : i32 +// UNROLL-BY-3-DAG: %[[C20:.*]] = arith.constant 20 : i32 +// UNROLL-BY-3-DAG: %[[C18:.*]] = arith.constant 18 : i32 +// UNROLL-BY-3-DAG: %[[C3:.*]] = arith.constant 3 : i32 +// UNROLL-BY-3: %[[FOR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C18]] step %[[C3]] +// UNROLL-BY-3-SAME: iter_args(%[[ARG0:.*]] = %[[CST]], %[[ARG1:.*]] = %[[CST]]) -> (f32, f32) : i32 { +// UNROLL-BY-3-NEXT: %[[ADD0:.*]] = arith.addf %[[ARG0]], %[[ARG1]] : f32 +// UNROLL-BY-3-NEXT: %[[MUL0:.*]] = arith.mulf %[[ARG0]], %[[ARG1]] : f32 +// UNROLL-BY-3-NEXT: %[[ADD1:.*]] = arith.addf %[[ADD0]], %[[MUL0]] : f32 +// UNROLL-BY-3-NEXT: %[[MUL1:.*]] = arith.mulf %[[ADD0]], %[[MUL0]] : f32 +// UNROLL-BY-3-NEXT: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[MUL1]] : f32 +// UNROLL-BY-3-NEXT: %[[MUL2:.*]] = arith.mulf %[[ADD1]], %[[MUL1]] : f32 +// UNROLL-BY-3-NEXT: scf.yield %[[ADD2]], %[[MUL2]] : f32, f32 +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3: %[[EFOR:.*]]:2 = scf.for %[[EIV:.*]] = %[[C18]] to %[[C20]] step %[[C1]] +// UNROLL-BY-3-SAME: iter_args(%[[EARG0:.*]] = %[[FOR]]#0, %[[EARG1:.*]] = %[[FOR]]#1) -> (f32, f32) : i32 { +// UNROLL-BY-3-NEXT: %[[EADD:.*]] = arith.addf %[[EARG0]], %[[EARG1]] : f32 +// UNROLL-BY-3-NEXT: %[[EMUL:.*]] = arith.mulf %[[EARG0]], %[[EARG1]] : f32 +// UNROLL-BY-3-NEXT: scf.yield 
%[[EADD]], %[[EMUL]] : f32, f32 +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: return %[[EFOR]]#0, %[[EFOR]]#1 : f32, f32 From 115b87636b9f84cf145c0c96859f8e9f5e76c7a1 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 09:28:45 -0700 Subject: [PATCH 12/72] [NFC][Support] Eliminate ',' at end of MemoryEffects print (#106545) - Eliminate comma at end of a MemoryEffects print. - Added basic unit test to validate that. --- llvm/lib/Support/ModRef.cpp | 7 ++++--- llvm/unittests/Support/CMakeLists.txt | 1 + llvm/unittests/Support/ModRefTest.cpp | 28 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 llvm/unittests/Support/ModRefTest.cpp diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index c5978296e97f0c..b57ea30b93832f 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ModRef.h" +#include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -33,7 +34,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, ModRefInfo MR) { } raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { - for (IRMemLocation Loc : MemoryEffects::locations()) { + interleaveComma(MemoryEffects::locations(), OS, [&](IRMemLocation Loc) { switch (Loc) { case IRMemLocation::ArgMem: OS << "ArgMem: "; @@ -45,7 +46,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { OS << "Other: "; break; } - OS << ME.getModRef(Loc) << ", "; - } + OS << ME.getModRef(Loc); + }); return OS; } diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index db47a170e814a6..d64f89847aa8e7 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -61,6 +61,7 @@ add_llvm_unittest(SupportTests MemoryBufferRefTest.cpp MemoryBufferTest.cpp MemoryTest.cpp + ModRefTest.cpp NativeFormatTests.cpp 
OptimizedStructLayoutTest.cpp ParallelTest.cpp diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp new file mode 100644 index 00000000000000..5ebb5f6a41a586 --- /dev/null +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -0,0 +1,28 @@ +//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ModRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; + +namespace { + +// Verify that printing a MemoryEffects does not end with a ,. +TEST(ModRefTest, PrintMemoryEffects) { + std::string S; + raw_string_ostream OS(S); + OS << MemoryEffects::none(); + OS.flush(); + EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); +} + +} // namespace From 81acc84997d6d5d5c7f8e3b3e6d8d01d567d0e1c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 29 Aug 2024 15:58:36 +0100 Subject: [PATCH 13/72] [LoopVectorize][X86] amdlibm-calls.ll - add 2/4/8/16 vector widths test checks for fallback to llvm intrinsics Check for cases where there isn't an amdlib call but it still vectorises the math call --- .../LoopVectorize/X86/amdlibm-calls.ll | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 1ec6a4febf18a8..04289d43f40e2f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -80,6 +80,7 @@ define void @sin_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double>
@amd_vrd2_sin(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_sin(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_sin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.sin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -102,6 +103,7 @@ for.end: define void @sin_f32(ptr nocapture %varray) { ; CHECK-LABEL: @sin_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_sinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_sinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_sinf(<16 x float> [[TMP4:%.*]]) @@ -130,6 +132,7 @@ define void @sin_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_sin(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_sin(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_sin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.sin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -152,6 +155,7 @@ for.end: define void @sin_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_sinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_sinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_sinf(<16 x float> [[TMP4:%.*]]) @@ -180,6 +184,7 @@ define void @cos_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cos(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cos(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> 
@amd_vrd8_cos(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cos.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -202,6 +207,7 @@ for.end: define void @cos_f32(ptr nocapture %varray) { ; CHECK-LABEL: @cos_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_cosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_cosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_cosf(<16 x float> [[TMP4:%.*]]) @@ -230,6 +236,7 @@ define void @cos_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cos(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cos(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_cos(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cos.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -252,6 +259,7 @@ for.end: define void @cos_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_cosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_cosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_cosf(<16 x float> [[TMP4:%.*]]) @@ -280,6 +288,7 @@ define void @tan_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.tan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -302,6 
+311,7 @@ for.end: define void @tan_f32(ptr nocapture %varray) { ; CHECK-LABEL: @tan_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanf(<16 x float> [[TMP4:%.*]]) @@ -330,6 +340,7 @@ define void @tan_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.tan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -352,6 +363,7 @@ for.end: define void @tan_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @tan_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanf(<16 x float> [[TMP4:%.*]]) @@ -770,6 +782,7 @@ define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-VF2: [[TMP8:%.*]] = call <2 x double> @amd_vrd2_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x double> @amd_vrd8_pow(<8 x double> [[TMP4:%.*]], <8 x double> [[WIDE_LOAD:%.*]]) +; CHECK-VF16: [[TMP8:%.*]] = call <16 x double> @llvm.pow.v16f64(<16 x double> [[TMP4:%.*]], <16 x double> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -797,6 +810,7 @@ 
define void @pow_f64_intrinsic(ptr nocapture %varray, ptr nocapture readonly %ex ; CHECK-VF2: [[TMP8:%.*]] = call <2 x double> @amd_vrd2_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x double> @amd_vrd8_pow(<8 x double> [[TMP4:%.*]], <8 x double> [[WIDE_LOAD:%.*]]) +; CHECK-VF16: [[TMP8:%.*]] = call <16 x double> @llvm.pow.v16f64(<16 x double> [[TMP4:%.*]], <16 x double> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -821,6 +835,7 @@ for.end: define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f32( +; CHECK-VF2: [[TMP8:%.*]] = call <2 x float> @llvm.pow.v2f32(<2 x float> [[TMP4:%.*]], <2 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x float> @amd_vrs4_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x float> @amd_vrs8_powf(<8 x float> [[TMP4:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF16: [[TMP8:%.*]] = call <16 x float> @amd_vrs16_powf(<16 x float> [[TMP4:%.*]], <16 x float> [[WIDE_LOAD:%.*]]) @@ -848,6 +863,7 @@ for.end: define void @pow_f32_intrinsic(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK-VF2: [[TMP8:%.*]] = call <2 x float> @llvm.pow.v2f32(<2 x float> [[TMP4:%.*]], <2 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF4: [[TMP8:%.*]] = call <4 x float> @amd_vrs4_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF8: [[TMP8:%.*]] = call <8 x float> @amd_vrs8_powf(<8 x float> [[TMP4:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) ; CHECK-VF16: [[TMP8:%.*]] = call <16 x float> @amd_vrs16_powf(<16 x float> [[TMP4:%.*]], <16 x float> [[WIDE_LOAD:%.*]]) @@ -878,6 +894,7 @@ define void @exp_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] 
= call <4 x double> @amd_vrd4_exp(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -900,6 +917,7 @@ for.end: define void @exp_f32(ptr nocapture %varray) { ; CHECK-LABEL: @exp_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_expf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_expf(<16 x float> [[TMP4:%.*]]) @@ -928,6 +946,7 @@ define void @exp_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -950,6 +969,7 @@ for.end: define void @exp_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @exp_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_expf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_expf(<16 x float> [[TMP4:%.*]]) @@ -978,6 +998,7 @@ define void @log_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: 
[[TMP5:%.*]] = call <16 x double> @llvm.log.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1000,6 +1021,7 @@ for.end: define void @log_f32(ptr nocapture %varray) { ; CHECK-LABEL: @log_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_logf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_logf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_logf(<16 x float> [[TMP4:%.*]]) @@ -1028,6 +1050,7 @@ define void @log_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1050,6 +1073,7 @@ for.end: define void @log_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @log_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_logf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_logf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_logf(<16 x float> [[TMP4:%.*]]) @@ -1078,6 +1102,7 @@ define void @log2_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1100,6 +1125,7 @@ for.end: define void 
@log2_f32(ptr nocapture %varray) { ; CHECK-LABEL: @log2_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log2f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log2f(<16 x float> [[TMP4:%.*]]) @@ -1128,6 +1154,7 @@ define void @log2_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.log2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1150,6 +1177,7 @@ for.end: define void @log2_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @log2_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log2f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log2f(<16 x float> [[TMP4:%.*]]) @@ -1175,6 +1203,7 @@ for.end: define void @log10_f32(ptr nocapture %varray) { ; CHECK-LABEL: @log10_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.log10.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log10f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log10f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log10f(<16 x float> [[TMP4:%.*]]) @@ -1200,6 +1229,7 @@ for.end: define void @log10_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @log10_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> 
@llvm.log10.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log10f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_log10f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log10f(<16 x float> [[TMP4:%.*]]) @@ -1228,6 +1258,7 @@ define void @exp2_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1250,6 +1281,7 @@ for.end: define void @exp2_f32(ptr nocapture %varray) { ; CHECK-LABEL: @exp2_f32( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp2f(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp2f(<16 x float> [[TMP4:%.*]]) @@ -1278,6 +1310,7 @@ define void @exp2_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp2(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp2(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp2(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.exp2.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -1300,6 +1333,7 @@ for.end: define void @exp2_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @exp2_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp2f(<4 x float> [[TMP4:%.*]]) ; 
CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp2f(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp2f(<16 x float> [[TMP4:%.*]]) From a777a93118a0ca71e19ac764a57a94f1be227dbb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 29 Aug 2024 17:24:05 +0100 Subject: [PATCH 14/72] Fix MSVC "not all control paths return a value" warning. NFC. --- llvm/utils/TableGen/IntrinsicEmitter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 06b430ae011405..70ccecf7752af7 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -451,6 +451,7 @@ static AttributeSet getIntrinsicArgAttributeSet(LLVMContext &C, unsigned ID) { case CodeGenIntrinsic::Dereferenceable: return "Dereferenceable"; } + llvm_unreachable("Unknown CodeGenIntrinsic::ArgAttrKind enum"); }; OS << formatv(R"( From bd6531b9508624df83f84d9bc687a7339df452e9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 29 Aug 2024 09:45:19 -0700 Subject: [PATCH 15/72] [LTO] Introduce a new class ImportIDTable (#106503) The new class implements a deduplication table to convert import list elements: {SourceModule, GUID, Definition/Declaration} into 32-bit integers, and vice versa. This patch adds a unit test but does not add a use yet. To be precise, the deduplication table holds {SourceModule, GUID} pairs. We use the bottom one bit of the 32-bit integers to indicate whether we have a definition or declaration. A subsequent patch will collapse the import list hierarchy -- FunctionsToImportTy holding many instances of FunctionsToImportTy -- down to DenseSet with each element indexing into the deduplication table above. This will address multiple sources of space inefficiency. 
--- .../llvm/Transforms/IPO/FunctionImport.h | 71 +++++++++++++++ llvm/unittests/Transforms/IPO/CMakeLists.txt | 1 + .../Transforms/IPO/ImportIDTableTests.cpp | 91 +++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index b7280c56be9cc8..99fe110191dec9 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -10,6 +10,7 @@ #define LLVM_TRANSFORMS_IPO_FUNCTIONIMPORT_H #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/ModuleSummaryIndex.h" @@ -96,6 +97,76 @@ class FunctionImporter { std::tuple>>; + // Issues import IDs. Each ID uniquely corresponds to a tuple of + // (FromModule, GUID, Definition/Declaration). + // + // The import IDs make the import list space efficient by referring to each + // import with a 32-bit integer ID while maintaining a central table that maps + // those integer IDs to tuples of (FromModule, GUID, Def/Decl). + // + // In one large application, a pair of (FromModule, GUID) is mentioned in + // import lists more than 50 times on average across all destination modules. + // Mentioning the 32-byte tuple: + // + // std::tuple + // + // 50 times by value in various import lists would be costly. We can reduce + // the memory footprint of import lists by placing one copy in a central table + // and referring to it with 32-bit integer IDs. + // + // To save space within the central table, we only store pairs of + // (FromModule, GUID) in the central table. In the actual 32-bit integer ID, + // the top 31 bits index into the central table while the bottom 1 bit + // indicates whether an ID is for GlobalValueSummary::Declaration or + // GlobalValueSummary::Definition. 
+ class ImportIDTable { + public: + using ImportIDTy = uint32_t; + + // Create a pair of import IDs [Def, Decl] for a given pair of FromModule + // and GUID. + std::pair createImportIDs(StringRef FromModule, + GlobalValue::GUID GUID) { + auto Key = std::make_pair(FromModule, GUID); + auto InsertResult = TheTable.try_emplace(Key, TheTable.size()); + return makeIDPair(InsertResult.first->second); + } + + // Get a pair of previously created import IDs [Def, Decl] for a given pair + // of FromModule and GUID. Returns std::nullopt if not available. + std::optional> + getImportIDs(StringRef FromModule, GlobalValue::GUID GUID) { + auto Key = std::make_pair(FromModule, GUID); + auto It = TheTable.find(Key); + if (It != TheTable.end()) + return makeIDPair(It->second); + return std::nullopt; + } + + // Return a tuple of [FromModule, GUID, Def/Decl] that a given ImportID + // corresponds to. + std::tuple + lookup(ImportIDTy ImportID) const { + GlobalValueSummary::ImportKind Kind = + (ImportID & 1) ? GlobalValueSummary::Declaration + : GlobalValueSummary::Definition; + auto It = TheTable.begin() + (ImportID >> 1); + StringRef FromModule = It->first.first; + GlobalValue::GUID GUID = It->first.second; + return std::make_tuple(FromModule, GUID, Kind); + } + + private: + // Make a pair of import IDs [Def, Decl] from an index into TheTable. + static std::pair makeIDPair(ImportIDTy Index) { + ImportIDTy Def = Index << 1; + ImportIDTy Decl = Def | 1; + return std::make_pair(Def, Decl); + } + + MapVector, ImportIDTy> TheTable; + }; + /// The map maintains the list of imports. 
Conceptually, it is a collection /// of tuples of the form: /// diff --git a/llvm/unittests/Transforms/IPO/CMakeLists.txt b/llvm/unittests/Transforms/IPO/CMakeLists.txt index ac632450d57305..65be2966bec049 100644 --- a/llvm/unittests/Transforms/IPO/CMakeLists.txt +++ b/llvm/unittests/Transforms/IPO/CMakeLists.txt @@ -13,4 +13,5 @@ add_llvm_unittest(IPOTests WholeProgramDevirt.cpp AttributorTest.cpp FunctionSpecializationTest.cpp + ImportIDTableTests.cpp ) diff --git a/llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp b/llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp new file mode 100644 index 00000000000000..09e0a25ec9b93a --- /dev/null +++ b/llvm/unittests/Transforms/IPO/ImportIDTableTests.cpp @@ -0,0 +1,91 @@ +//===- ImportIDTableTests.cpp - Unit tests for ImportIDTable --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/FunctionImport.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include +#include + +using namespace llvm; + +TEST(ImportIDTableTests, Basic) { + FunctionImporter::ImportIDTable Table; + + auto [Def, Decl] = Table.createImportIDs("mod", 123U); + auto [Def2, Decl2] = Table.createImportIDs("stuff", 456U); + + // Def and Decl must be of the same unsigned integer type. + static_assert( + std::is_unsigned_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // Check that all IDs are unique. + std::set IDs = {Def, Decl, Def2, + Decl2}; + EXPECT_THAT(IDs, ::testing::SizeIs(4)); + + // Verify what Def maps to. 
+ auto DefTuple = Table.lookup(Def); + EXPECT_EQ(std::get<0>(DefTuple), StringRef("mod")); + EXPECT_EQ(std::get<1>(DefTuple), 123U); + EXPECT_EQ(std::get<2>(DefTuple), GlobalValueSummary::Definition); + + // Verify what Decl maps to. + auto DeclTuple = Table.lookup(Decl); + EXPECT_EQ(std::get<0>(DeclTuple), StringRef("mod")); + EXPECT_EQ(std::get<1>(DeclTuple), 123U); + EXPECT_EQ(std::get<2>(DeclTuple), GlobalValueSummary::Declaration); + + // Verify what Def2 maps to. + auto Def2Tuple = Table.lookup(Def2); + EXPECT_EQ(std::get<0>(Def2Tuple), StringRef("stuff")); + EXPECT_EQ(std::get<1>(Def2Tuple), 456U); + EXPECT_EQ(std::get<2>(Def2Tuple), GlobalValueSummary::Definition); + + // Verify what Decl2 maps to. + auto Decl2Tuple = Table.lookup(Decl2); + EXPECT_EQ(std::get<0>(Decl2Tuple), StringRef("stuff")); + EXPECT_EQ(std::get<1>(Decl2Tuple), 456U); + EXPECT_EQ(std::get<2>(Decl2Tuple), GlobalValueSummary::Declaration); +} + +TEST(ImportIDTableTests, Duplicates) { + FunctionImporter::ImportIDTable Table; + + auto [Def1, Decl1] = Table.createImportIDs("mod", 123U); + auto [Def2, Decl2] = Table.createImportIDs("mod", 123U); + + // Verify we get the same IDs back. + EXPECT_EQ(Def1, Def2); + EXPECT_EQ(Decl1, Decl2); +} + +TEST(ImportIDTableTests, Present) { + FunctionImporter::ImportIDTable Table; + + auto [Def, Decl] = Table.createImportIDs("mod", 123U); + auto Result = Table.getImportIDs("mod", 123U); + + // Verify that we get the same IDs back. + ASSERT_NE(Result, std::nullopt); + EXPECT_EQ(Result->first, Def); + EXPECT_EQ(Result->second, Decl); +} + +TEST(ImportIDTableTests, Missing) { + FunctionImporter::ImportIDTable Table; + + auto Result = Table.getImportIDs("mod", 123U); + + // Verify that we get std::nullopt for a non-existent pair. 
+ EXPECT_EQ(Result, std::nullopt); +} From 59f05b683def5ef728baf8c4ae8f846e957ad67f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 09:45:47 -0700 Subject: [PATCH 16/72] [RISCV][TTI] Model cost for insert/extract into illegal types (#106440) We'd previously just deferred to the base implementation, but that more or less always returns 1. This underestimates the cost of the insert/extract, biases the SLP vectorizer towards forming illegally typed vectors, and underestimates the cost of scalarized operations (like unaligned scatter/gather). --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 3 - .../CostModel/RISCV/fixed-vector-gather.ll | 36 +-- .../CostModel/RISCV/fixed-vector-scatter.ll | 36 +-- .../CostModel/RISCV/rvv-extractelement.ll | 266 +++++++++--------- .../CostModel/RISCV/rvv-insertelement.ll | 250 ++++++++-------- 5 files changed, 294 insertions(+), 297 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 537c62bb0aacd1..bb8e162f57dfcd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1626,9 +1626,6 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (LT.second.isScalableVector() && !LT.first.isValid()) return LT.first; - if (!isTypeLegal(Val)) - return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); - // Mask vector extract/insert is expanded via e8. 
if (Val->getScalarSizeInBits() == 1) { VectorType *WideTy = diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll index ec7eb81d98bf91..f37cd99e803ec9 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll @@ -42,34 +42,34 @@ define i32 @masked_gather() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V1F64.u = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F16.u = call <16 x half> 
@llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16.u = call <1 x half> @llvm.masked.gather.v1f16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64.u = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32.u = call <2 x i32> 
@llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32.u = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i16> undef) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I16.u = call <1 x i16> @llvm.masked.gather.v1i16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll index 6da9d8d73cbd4c..ed15493c5e1e77 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll @@ -42,34 +42,34 @@ define i32 @masked_scatter() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i8.v1p0(<1 x i8> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 2, <2 x 
i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost 
of 31 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f16.v1p0(<1 x half> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, 
<4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i16.v1p0(<1 x i16> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll index c9145784afcc1c..4a9e30888cdd1f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll @@ -61,7 +61,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %nxv2i64_0 = extractelement undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_0 = extractelement undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_0 = extractelement undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = extractelement undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_0 = extractelement undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 @@ -85,7 +85,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 @@ -97,7 +97,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 @@ -107,7 +107,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_1 = extractelement <8 x i64> undef, i32 1 @@ -115,7 +115,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -139,7 +139,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x @@ -151,7 +151,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x @@ -161,7 +161,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x @@ -169,7 +169,7 @@ define void @extractelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost 
of 5 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_int' @@ -250,7 +250,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 @@ -262,7 +262,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv32i16_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 @@ -272,7 +272,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_1 = extractelement <8 x i64> undef, i32 1 @@ -280,7 +280,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv8i64_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -304,7 +304,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x @@ -316,7 +316,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv32i16_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x @@ -326,7 +326,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x @@ -334,7 +334,7 @@ define void @extractelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_int' @@ -387,11 +387,11 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_0 = extractelement <16 x i64> undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_0 = extractelement <16 x i64> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_0 = extractelement undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_0 = extractelement undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_0 = extractelement undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = extractelement undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_0 = extractelement undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1 @@ -408,44 +408,44 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = extractelement <16 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = extractelement <32 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = extractelement <64 x i8> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = extractelement <16 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = extractelement <32 x i16> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = extractelement <16 x i32> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_1 = extractelement <8 x i64> undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_1 = extractelement undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv16i64_1 = extractelement undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -462,44 +462,44 @@ define void @extractelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_x = extractelement <16 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_x = extractelement <32 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_x = extractelement <64 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_x = extractelement <16 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_x = extractelement <32 x i16> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_x = extractelement <16 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'extractelement_int' @@ -573,44 +573,44 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = extractelement <16 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = extractelement <32 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = extractelement <64 x i8> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = extractelement <128 x i8> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv32i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = extractelement <4 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = extractelement <8 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = extractelement <16 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = extractelement <32 x i16> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = extractelement <64 x i16> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv64i16_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = extractelement <4 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = extractelement <8 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = extractelement <16 x i32> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = extractelement <32 x i32> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = extractelement <4 x i64> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_1 = 
extractelement <8 x i64> undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = extractelement <16 x i64> undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_1 = extractelement undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = extractelement undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = extractelement undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x @@ -627,44 +627,44 @@ define void @extractelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_x = extractelement <16 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_x = extractelement <32 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_x = extractelement <64 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_x = extractelement <128 x i8> undef, i32 %x ; RV64ZVE64X-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv2i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_x = extractelement <2 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_x = extractelement <4 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_x = extractelement <8 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_x = extractelement <16 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_x = extractelement <32 x i16> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_x = extractelement <64 x i16> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %nxv4i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_x = extractelement <2 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_x = extractelement <4 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_x = extractelement <8 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_x = extractelement <16 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_x = extractelement <32 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv32i32_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_x = extractelement <2 x i64> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_x = extractelement <4 x i64> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_x = extractelement <8 x i64> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_x = extractelement <16 x i64> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_x = extractelement undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = extractelement undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = extractelement undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i1_0 = extractelement <2 x i1> undef, i32 0 @@ -868,68 +868,68 @@ define void @extractelement_int_lmul(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV32V-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV32V-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_int_lmul' ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x +; 
RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_int_lmul' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x 
i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'extractelement_int_lmul' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = extractelement <128 x i8> undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = extractelement <128 x i8> undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = extractelement <128 x i8> undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = extractelement <256 x i8> undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = extractelement <256 x i8> undef, i32 255 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = extractelement <32 x i32> undef, i32 3 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = extractelement <32 x i32> undef, i32 7 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = extractelement <32 x i32> undef, i32 15 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = extractelement <32 x i32> undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = extractelement <64 x i32> undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8 = extractelement <128 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8 = extractelement <256 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32 = extractelement <32 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v64i32 = extractelement <64 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v128i8_31 = extractelement <128 x i8> undef, i32 31 @@ -997,7 +997,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = extractelement <2 x float> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = extractelement <4 x float> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = extractelement <8 x float> undef, i32 1 @@ -1007,7 +1007,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v4f64_1 = extractelement <4 x double> undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = extractelement <8 x double> undef, i32 1 @@ -1015,7 +1015,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = extractelement undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = extractelement undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = extractelement undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_x = extractelement <2 x half> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_x = extractelement <4 x half> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_x = extractelement <8 x half> undef, i32 %x @@ -1027,7 +1027,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_x = extractelement <2 x float> undef, i32 %x ; RV32V-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4f32_x = extractelement <4 x float> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_x = extractelement <8 x float> undef, i32 %x @@ -1037,7 +1037,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_x = extractelement <2 x double> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_x = extractelement <4 x double> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_x = extractelement <8 x double> undef, i32 %x @@ -1045,7 +1045,7 @@ define void @extractelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_x = extractelement undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = extractelement undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_x = extractelement undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_fp' @@ 
-1090,7 +1090,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = extractelement <2 x float> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = extractelement <4 x float> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = extractelement <8 x float> undef, i32 1 @@ -1100,7 +1100,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_1 = extractelement <4 x double> undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = extractelement <8 x double> undef, 
i32 1 @@ -1108,7 +1108,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = extractelement undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = extractelement undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = extractelement undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_x = extractelement <2 x half> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_x = extractelement <4 x half> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_x = extractelement <8 x half> undef, i32 %x @@ -1120,7 +1120,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_x = extractelement <2 x float> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_x = extractelement <4 x float> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_x = extractelement <8 x 
float> undef, i32 %x @@ -1130,7 +1130,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_x = extractelement <2 x double> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_x = extractelement <4 x double> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_x = extractelement <8 x double> undef, i32 %x @@ -1138,7 +1138,7 @@ define void @extractelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_x = extractelement undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = extractelement undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_x = extractelement undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_fp' @@ -1441,50 +1441,50 @@ define void @extractelement_fp(i32 %x) { define void @extractelement_int_nonpoweroftwo(i32 %x) { ; RV32V-LABEL: 'extractelement_int_nonpoweroftwo' -; RV32V-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = 
extractelement <31 x i32> undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'extractelement_int_nonpoweroftwo' -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV64V-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = extractelement <31 x i32> undef, i32 0 ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'extractelement_int_nonpoweroftwo' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 
%x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = extractelement <31 x i32> undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'extractelement_int_nonpoweroftwo' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = extractelement <15 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i8 = extractelement <3 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i8 = extractelement <7 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i8 = 
extractelement <15 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i8 = extractelement <31 x i8> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i32 = extractelement <3 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7i32 = extractelement <7 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15i32 = extractelement <15 x i32> undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v31i32 = extractelement <31 x i32> undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v31i32_0 = extractelement <31 x i32> undef, i32 0 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll index c167adf1f33977..0616e0919b9d9a 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll @@ -61,7 +61,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_0 = insertelement undef, i64 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_0 = insertelement undef, i64 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i64_0 = insertelement undef, i64 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1 ; 
RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 @@ -85,7 +85,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 @@ -97,7 +97,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 
; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 @@ -107,7 +107,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 @@ -115,7 +115,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement 
undef, i64 undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -139,7 +139,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x @@ -151,7 +151,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x @@ -161,7 +161,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x @@ -169,7 +169,7 @@ define void @insertelement_int(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found 
an estimated cost of 5 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_int' @@ -250,7 +250,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 @@ -262,7 +262,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 @@ -272,7 +272,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 @@ -280,7 +280,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; 
RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -304,7 +304,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x @@ -316,7 +316,7 @@ define void 
@insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x @@ -326,7 +326,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 
%x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x @@ -334,7 +334,7 @@ define void @insertelement_int(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_int' @@ -387,11 +387,11 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_0 = insertelement <16 x i64> undef, i64 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_0 = insertelement <16 x i64> undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_0 = insertelement undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_0 = insertelement undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %nxv8i64_0 = insertelement undef, i64 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_0 = insertelement undef, i64 undef, i32 0 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1 @@ -408,44 +408,44 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = insertelement <16 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = insertelement <32 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = insertelement <64 x i8> undef, i8 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; 
RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = insertelement <16 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = insertelement <32 x i16> undef, i16 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; 
RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = insertelement <16 x i32> undef, i32 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = insertelement undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; 
RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -462,44 +462,44 @@ define void @insertelement_int(i32 %x) { ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8_x = insertelement <16 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v32i8_x = insertelement <32 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_x = insertelement <64 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = insertelement <128 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_x = insertelement <128 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v16i16_x = insertelement <16 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_x = insertelement <32 x i16> undef, i16 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = insertelement <64 x i16> undef, i16 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_x = insertelement <64 x i16> undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_x = insertelement <16 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated 
cost of 5 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'insertelement_int' @@ -573,44 +573,44 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_1 = insertelement <16 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_1 = insertelement <32 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_1 = insertelement <64 x i8> undef, i8 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_1 = insertelement <128 x i8> undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i8_1 = insertelement undef, i8 undef, i32 1 -; RV64ZVE64X-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv128i8_1 = insertelement undef, i8 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_1 = insertelement <4 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_1 = insertelement <8 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = insertelement <16 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = insertelement <32 x i16> undef, i16 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i16_1 = insertelement <64 x i16> undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i16_1 = insertelement undef, i16 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv64i16_1 = insertelement undef, i16 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_1 = insertelement <4 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = insertelement <8 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = insertelement <16 x i32> undef, i32 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_1 = insertelement <32 x i32> undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i32_1 = insertelement undef, i32 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_1 = insertelement undef, i32 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = insertelement <4 x i64> undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %v8i64_1 = insertelement <8 x i64> undef, i64 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_1 = insertelement <16 x i64> undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_1 = insertelement undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_1 = insertelement undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_1 = insertelement undef, i64 undef, i32 1 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_1 = insertelement undef, i64 undef, i32 1 ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x @@ -627,44 +627,44 @@ define void @insertelement_int(i32 %x) { ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8_x = insertelement <16 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_x = insertelement <32 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_x = insertelement <64 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_x = 
insertelement <128 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_x = insertelement <128 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_x = insertelement undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv128i8_x = insertelement undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16_x = insertelement <2 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_x = insertelement <4 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16_x = insertelement <8 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_x = insertelement <16 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_x = insertelement <32 x i16> undef, i16 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_x = 
insertelement <64 x i16> undef, i16 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_x = insertelement <64 x i16> undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_x = insertelement undef, i16 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i16_x = insertelement undef, i16 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32_x = insertelement <2 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_x = insertelement <4 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32_x = insertelement <8 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32_x = insertelement <16 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_x = insertelement <32 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %nxv2i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_x = insertelement undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_x = insertelement undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_x = insertelement <2 x i64> undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_x = insertelement <4 x i64> undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_x = insertelement <8 x i64> undef, i64 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_x = insertelement <16 x i64> undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_x = insertelement undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_x = insertelement undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i64_x = insertelement undef, i64 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %nxv16i64_x = insertelement undef, i64 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i1_0 = insertelement <2 x i1> undef, i1 undef, i32 0 @@ -868,68 +868,68 @@ define void @insertelement_int_lmul(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = 
insertelement <64 x i32> undef, i32 undef, i32 63 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_int_lmul' ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 
= insertelement <32 x i32> undef, i32 undef, i32 3 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_int_lmul' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_127 = insertelement 
<128 x i8> undef, i8 undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'insertelement_int_lmul' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 -; 
RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i8_63 = insertelement <128 x i8> undef, i8 undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v128i8_127 = insertelement <128 x i8> undef, i8 undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_127 = insertelement <256 x i8> undef, i8 undef, i32 127 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i8_255 = insertelement <256 x i8> undef, i8 undef, i32 255 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_3 = insertelement <32 x i32> undef, i32 undef, i32 3 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_7 = insertelement <32 x i32> undef, i32 undef, i32 7 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_15 = insertelement <32 x i32> undef, i32 undef, i32 15 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_31 = insertelement <32 x i32> undef, i32 undef, i32 31 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_63 = insertelement <64 x i32> undef, i32 undef, i32 63 +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8 = insertelement <128 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v256i8 = insertelement <256 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32 = insertelement <32 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32 = insertelement <64 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v128i8_31 = insertelement <128 x i8> undef, i8 undef, i32 31 @@ -997,7 +997,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = insertelement undef, half undef, i32 1 ; RV32V-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv16f16_1 = insertelement undef, half undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = insertelement undef, half undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = insertelement <4 x float> undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = insertelement <8 x float> undef, float undef, i32 1 @@ -1007,7 +1007,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = insertelement undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = insertelement undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = insertelement undef, float undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_1 = insertelement <4 x double> undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = insertelement <8 x double> undef, double undef, i32 1 @@ -1015,7 
+1015,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = insertelement undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = insertelement undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = insertelement undef, double undef, i32 1 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 +; RV32V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_x = insertelement <2 x half> undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_x = insertelement <4 x half> undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_x = insertelement <8 x half> undef, half undef, i32 %x @@ -1027,7 +1027,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_x = insertelement undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_x = insertelement undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_x = insertelement undef, half undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_x = insertelement <2 x float> undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_x = 
insertelement <4 x float> undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_x = insertelement <8 x float> undef, float undef, i32 %x @@ -1037,7 +1037,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_x = insertelement undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_x = insertelement undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_x = insertelement undef, float undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_x = insertelement <2 x double> undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_x = insertelement <4 x double> undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_x = insertelement <8 x double> undef, double undef, i32 %x @@ -1045,7 +1045,7 @@ define void @insertelement_fp(i32 %x) { ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_x = insertelement undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_x = insertelement undef, double undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_x = insertelement undef, double undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 
%x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_fp' @@ -1090,7 +1090,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_1 = insertelement undef, half undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16_1 = insertelement undef, half undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f16_1 = insertelement undef, half undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_1 = insertelement undef, half undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_1 = insertelement <4 x float> undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_1 = insertelement <8 x float> undef, float undef, i32 1 @@ -1100,7 +1100,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_1 = insertelement undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_1 = insertelement undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_1 = insertelement undef, float undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_1 = insertelement undef, float undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_1 = 
insertelement <2 x double> undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_1 = insertelement <4 x double> undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_1 = insertelement <8 x double> undef, double undef, i32 1 @@ -1108,7 +1108,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_1 = insertelement undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_1 = insertelement undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_1 = insertelement undef, double undef, i32 1 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 +; RV64V-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_1 = insertelement undef, double undef, i32 1 ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_x = insertelement <2 x half> undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_x = insertelement <4 x half> undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_x = insertelement <8 x half> undef, half undef, i32 %x @@ -1120,7 +1120,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_x = insertelement undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_x = insertelement undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_x = insertelement undef, half undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x +; 
RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_x = insertelement undef, half undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_x = insertelement <2 x float> undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_x = insertelement <4 x float> undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_x = insertelement <8 x float> undef, float undef, i32 %x @@ -1130,7 +1130,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_x = insertelement undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_x = insertelement undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_x = insertelement undef, float undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_x = insertelement undef, float undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_x = insertelement <2 x double> undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_x = insertelement <4 x double> undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_x = insertelement <8 x double> undef, double undef, i32 %x @@ -1138,7 +1138,7 @@ define void @insertelement_fp(i32 %x) { ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_x = insertelement undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_x = insertelement undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %nxv8f64_x = insertelement undef, double undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_x = insertelement undef, double undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_fp' @@ -1441,43 +1441,43 @@ define void @insertelement_fp(i32 %x) { define void @insertelement_int_nonpoweroftwo(i32 %x) { ; RV32V-LABEL: 'insertelement_int_nonpoweroftwo' -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV32V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x +; RV32V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV32V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64V-LABEL: 'insertelement_int_nonpoweroftwo' -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV64V-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x 
i32> undef, i32 undef, i32 %x +; RV64V-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV64V-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV32ZVE64X-LABEL: 'insertelement_int_nonpoweroftwo' -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x +; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV32ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64ZVE64X-LABEL: 'insertelement_int_nonpoweroftwo' -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i8 = insertelement <7 x i8> undef, i8 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i8 = insertelement <15 x i8> undef, i8 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32_0 = insertelement <3 x i32> undef, i32 undef, i32 0 -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x -; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3i32 = insertelement <3 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v7i32 = insertelement <7 x i32> undef, i32 undef, i32 %x +; RV64ZVE64X-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v15i32 = insertelement <15 x i32> undef, i32 undef, i32 %x ; RV64ZVE64X-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void ; %v3i8 = insertelement <3 x i8> undef, i8 undef, i32 %x From e5e38ddf1b8043324175868831da21e941c00aff Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Thu, 29 Aug 2024 09:50:44 -0700 Subject: [PATCH 17/72] [AArch64] Make apple-m4 armv8.7-a again (from armv9.2-a). (#106312) This is a partial revert of c66e1d6f3429. Even though that allowed us to declare v9.2-a support without picking up SVE2 in both the backend and the driver, the frontend itself still enabled SVE via the arch version's default extensions. Avoid that by reverting back to v8.7-a while we look into longer-term solutions. --- clang/test/CodeGen/aarch64-targetattr.c | 9 +++++++++ llvm/lib/Target/AArch64/AArch64Processors.td | 7 ++++++- llvm/unittests/TargetParser/TargetParserTest.cpp | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c index d6227be2ebef83..1bc78a6e1f8c0f 100644 --- a/clang/test/CodeGen/aarch64-targetattr.c +++ b/clang/test/CodeGen/aarch64-targetattr.c @@ -191,6 +191,14 @@ __attribute__((target("no-v9.3a"))) // void minusarch() {} +__attribute__((target("cpu=apple-m4"))) +// CHECK-LABEL: define {{[^@]+}}@applem4 +// CHECK-SAME: () #[[ATTR18:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +void applem4() {} + //. 
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } @@ -210,6 +218,7 @@ void minusarch() {} // CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "branch-target-enforcement" "guarded-control-stack" "no-trapping-math"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.3a" } +// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m4" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+sme,+sme-f64f64,+sme-i16i64,+sme2,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8.7a,+v8a,+wfxt" } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 84d8cae3a0a5d1..1d6c71cbbf0ec3 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -895,7 +895,12 @@ def ProcessorFeatures { FeatureLSE, FeaturePAuth, FeatureFPAC, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureBF16, FeatureDotProd, FeatureMatMulInt8, FeatureSSBS]; - list AppleM4 = [HasV9_2aOps, FeatureSHA2, FeatureFPARMv8, + // Technically apple-m4 is v9.2a, but we can't use that here. + // Historically, llvm defined v9.0a as requiring SVE, but it's optional + // according to the Arm ARM, and not supported by the core. We decoupled the + // two in the clang driver and in the backend subtarget features, but it's + // still an issue in the clang frontend. v8.7a is the next closest choice. + list AppleM4 = [HasV8_7aOps, FeatureSHA2, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, FeatureAES, FeatureBF16, diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 7d999b826252a2..13db80ab5c68ea 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1122,7 +1122,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("apple-a16", "armv8.6-a"), AArch64CPUTestParams("apple-m3", "armv8.6-a"), AArch64CPUTestParams("apple-a17", "armv8.6-a"), - AArch64CPUTestParams("apple-m4", "armv9.2-a"), + AArch64CPUTestParams("apple-m4", "armv8.7-a"), AArch64CPUTestParams("exynos-m3", "armv8-a"), AArch64CPUTestParams("exynos-m4", "armv8.2-a"), AArch64CPUTestParams("exynos-m5", "armv8.2-a"), From 3d08ade7bd32f0296e0ca3a13640cc95fa89229a Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 29 Aug 2024 17:53:32 +0100 Subject: [PATCH 18/72] 
[ExtendLifetimes] Implement llvm.fake.use to extend variable lifetimes (#86149) This patch is part of a set of patches that add an `-fextend-lifetimes` flag to clang, which extends the lifetimes of local variables and parameters for improved debuggability. In addition to that flag, the patch series adds a pragma to selectively disable `-fextend-lifetimes`, and an `-fextend-this-ptr` flag which functions as `-fextend-lifetimes` for this pointers only. All changes and tests in these patches were written by Wolfgang Pieb (@wolfy1961), while Stephen Tozer (@SLTozer) has handled review and merging. The extend lifetimes flag is intended to eventually be set on by `-Og`, as discussed in the RFC here: https://discourse.llvm.org/t/rfc-redefine-og-o1-and-add-a-new-level-of-og/72850 This patch implements a new intrinsic instruction in LLVM, `llvm.fake.use` in IR and `FAKE_USE` in MIR, that takes a single operand and has no effect other than "using" its operand, to ensure that its operand remains live until after the fake use. This patch does not emit fake uses anywhere; the next patch in this sequence causes them to be emitted from the clang frontend, such that for each variable (or this) a fake.use operand is inserted at the end of that variable's scope, using that variable's value. This patch covers everything post-frontend, which is largely just the basic plumbing for a new intrinsic/instruction, along with a few steps to preserve the fake uses through optimizations (such as moving them ahead of a tail call or translating them through SROA). 
Co-authored-by: Stephen Tozer --- llvm/docs/LangRef.rst | 36 ++++ llvm/include/llvm/Analysis/PtrUseVisitor.h | 6 + llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 + llvm/include/llvm/CodeGen/MachineInstr.h | 2 + llvm/include/llvm/CodeGen/Passes.h | 3 + llvm/include/llvm/CodeGen/SelectionDAGISel.h | 1 + llvm/include/llvm/IR/Intrinsics.td | 3 + llvm/include/llvm/InitializePasses.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/include/llvm/Support/TargetOpcodes.def | 3 + llvm/include/llvm/Target/Target.td | 10 ++ llvm/lib/CodeGen/Analysis.cpp | 3 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 19 ++ llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/CodeGen.cpp | 1 + llvm/lib/CodeGen/CodeGenPrepare.cpp | 44 ++++- .../CodeGen/DeadMachineInstructionElim.cpp | 3 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 8 + llvm/lib/CodeGen/GlobalISel/Utils.cpp | 3 + llvm/lib/CodeGen/MachineCSE.cpp | 3 +- llvm/lib/CodeGen/MachineScheduler.cpp | 3 +- llvm/lib/CodeGen/MachineSink.cpp | 2 +- llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp | 162 ++++++++++++++++++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 + .../SelectionDAG/LegalizeFloatTypes.cpp | 20 +++ .../SelectionDAG/LegalizeIntegerTypes.cpp | 19 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 7 + .../SelectionDAG/LegalizeTypesGeneric.cpp | 11 ++ .../SelectionDAG/LegalizeVectorTypes.cpp | 36 ++++ .../SelectionDAG/SelectionDAGBuilder.cpp | 33 ++++ .../SelectionDAG/SelectionDAGDumper.cpp | 2 + .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 64 +++++++ llvm/lib/CodeGen/TargetPassConfig.cpp | 1 + llvm/lib/IR/Instruction.cpp | 5 +- llvm/lib/IR/Verifier.cpp | 1 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 1 + llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 1 + .../WebAssembly/WebAssemblyTargetMachine.cpp | 1 + llvm/lib/Target/X86/X86FloatingPoint.cpp | 32 ++++ llvm/lib/Transforms/Scalar/SROA.cpp | 30 ++++ llvm/lib/Transforms/Utils/CloneFunction.cpp | 6 + llvm/lib/Transforms/Utils/Local.cpp | 3 + 
.../Utils/PromoteMemoryToRegister.cpp | 3 +- .../ScalarEvolution/flags-from-poison-dbg.ll | 2 +- llvm/test/CodeGen/AArch64/O0-pipeline.ll | 1 + llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + llvm/test/CodeGen/ARM/O3-pipeline.ll | 1 + llvm/test/CodeGen/LoongArch/O0-pipeline.ll | 1 + llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 1 + .../CodeGen/MIR/X86/fake-use-tailcall.mir | 99 +++++++++++ llvm/test/CodeGen/PowerPC/O0-pipeline.ll | 1 + llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/O0-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 + llvm/test/CodeGen/X86/O0-pipeline.ll | 1 + llvm/test/CodeGen/X86/fake-use-hpfloat.ll | 15 ++ llvm/test/CodeGen/X86/fake-use-ld.ll | 43 +++++ llvm/test/CodeGen/X86/fake-use-scheduler.mir | 123 +++++++++++++ .../CodeGen/X86/fake-use-simple-tail-call.ll | 24 +++ .../CodeGen/X86/fake-use-suppress-load.ll | 14 ++ llvm/test/CodeGen/X86/fake-use-tailcall.ll | 37 ++++ llvm/test/CodeGen/X86/fake-use-vector.ll | 39 +++++ llvm/test/CodeGen/X86/fake-use-vector2.ll | 27 +++ llvm/test/CodeGen/X86/fake-use-zero-length.ll | 30 ++++ llvm/test/CodeGen/X86/opt-pipeline.ll | 1 + .../DebugInfo/AArch64/fake-use-global-isel.ll | 98 +++++++++++ llvm/test/DebugInfo/Inputs/check-fake-use.py | 107 ++++++++++++ llvm/test/DebugInfo/X86/fake-use.ll | 96 +++++++++++ .../GlobalISelCombinerEmitter/match-table.td | 54 +++--- .../CodeGenPrepare/X86/fake-use-phi.ll | 50 ++++++ .../CodeGenPrepare/X86/fake-use-split-ret.ll | 37 ++++ .../test/Transforms/GVN/fake-use-constprop.ll | 60 +++++++ llvm/test/Transforms/SROA/fake-use-escape.ll | 21 +++ llvm/test/Transforms/SROA/fake-use-sroa.ll | 52 ++++++ .../gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 + 76 files changed, 1609 insertions(+), 38 deletions(-) create mode 100644 llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp create mode 100644 llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir create mode 100644 llvm/test/CodeGen/X86/fake-use-hpfloat.ll 
create mode 100644 llvm/test/CodeGen/X86/fake-use-ld.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-scheduler.mir create mode 100644 llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-suppress-load.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-tailcall.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-vector.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-vector2.ll create mode 100644 llvm/test/CodeGen/X86/fake-use-zero-length.ll create mode 100644 llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll create mode 100644 llvm/test/DebugInfo/Inputs/check-fake-use.py create mode 100644 llvm/test/DebugInfo/X86/fake-use.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll create mode 100644 llvm/test/Transforms/GVN/fake-use-constprop.ll create mode 100644 llvm/test/Transforms/SROA/fake-use-escape.ll create mode 100644 llvm/test/Transforms/SROA/fake-use-sroa.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 8c696cb16e77f8..cf0a6f96fb012e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -29477,6 +29477,42 @@ execution, but is unknown at compile time. If the result value does not fit in the result type, then the result is a :ref:`poison value `. +.. _llvm_fake_use: + +'``llvm.fake.use``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.fake.use(...) + +Overview: +""""""""" + +The ``llvm.fake.use`` intrinsic is a no-op. It takes a single +value as an operand and is treated as a use of that operand, to force the +optimizer to preserve that value prior to the fake use. This is used for +extending the lifetimes of variables, where this intrinsic placed at the end of +a variable's scope helps prevent that variable from being optimized out. 
+ +Arguments: +"""""""""" + +The ``llvm.fake.use`` intrinsic takes one argument, which may be any +function-local SSA value. Note that the signature is variadic so that the +intrinsic can take any type of argument, but passing more than one argument will +result in an error. + +Semantics: +"""""""""" + +This intrinsic does nothing, but optimizers must consider it a use of its single +operand and should try to preserve the intrinsic and its position in the +function. + Stack Map Intrinsics -------------------- diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index b6cc14d2077af0..f5c23b1b4e014d 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -278,6 +278,12 @@ class PtrUseVisitor : protected InstVisitor, default: return Base::visitIntrinsicInst(II); + // We escape pointers used by a fake_use to prevent SROA from transforming + // them. + case Intrinsic::fake_use: + PI.setEscaped(&II); + return; + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: return; // No-op intrinsics. diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 86ff2628975942..187d624f0a73b9 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1372,6 +1372,11 @@ enum NodeType { LIFETIME_START, LIFETIME_END, + /// FAKE_USE represents a use of the operand but does not do anything. + /// Its purpose is the extension of the operand's lifetime mainly for + /// debugging purposes. + FAKE_USE, + /// GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the /// beginning and end of GC transition sequence, and carry arbitrary /// information that target might need for lowering. 
The first operand is diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 04c8144f2fe7af..62667cc8ef3800 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1435,6 +1435,8 @@ class MachineInstr return getOpcode() == TargetOpcode::EXTRACT_SUBREG; } + bool isFakeUse() const { return getOpcode() == TargetOpcode::FAKE_USE; } + /// Return true if the instruction behaves like a copy. /// This does not include native copy instructions. bool isCopyLike() const { diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index c7c2178571215b..dbdd110b0600e5 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -440,6 +440,9 @@ namespace llvm { // metadata after llvm SanitizerBinaryMetadata pass. extern char &MachineSanitizerBinaryMetadataID; + /// RemoveLoadsIntoFakeUses pass. + extern char &RemoveLoadsIntoFakeUsesID; + /// RemoveRedundantDebugValues pass. 
extern char &RemoveRedundantDebugValuesID; diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index fc0590b1a1b69e..f6191c6fdb7fe6 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -463,6 +463,7 @@ class SelectionDAGISel { void Select_READ_REGISTER(SDNode *Op); void Select_WRITE_REGISTER(SDNode *Op); void Select_UNDEF(SDNode *N); + void Select_FAKE_USE(SDNode *N); void CannotYetSelect(SDNode *N); void Select_FREEZE(SDNode *N); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index e3bf0446575ae5..232d6be1073f49 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1835,6 +1835,9 @@ def int_is_constant : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem, IntrWillReturn, IntrConvergent], "llvm.is.constant">; +// Introduce a use of the argument without generating any code. +def int_fake_use : Intrinsic<[], [llvm_vararg_ty]>; + // Intrinsic to mask out bits of a pointer. // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index cc5e93c58f564a..47a1ca15fc0d1f 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -264,6 +264,7 @@ void initializeRegionOnlyViewerPass(PassRegistry &); void initializeRegionPrinterPass(PassRegistry &); void initializeRegionViewerPass(PassRegistry &); void initializeRegisterCoalescerPass(PassRegistry &); +void initializeRemoveLoadsIntoFakeUsesPass(PassRegistry &); void initializeRemoveRedundantDebugValuesPass(PassRegistry &); void initializeRenameIndependentSubregsPass(PassRegistry &); void initializeReplaceWithVeclibLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 05baf514fa7210..b710b1c46f643f 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -250,6 +250,7 @@ DUMMY_MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass) DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass) DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass) +DUMMY_MACHINE_FUNCTION_PASS("remove-loads-into-fake-uses", RemoveLoadsIntoFakeUsesPass) DUMMY_MACHINE_FUNCTION_PASS("removeredundantdebugvalues", RemoveRedundantDebugValuesPass) DUMMY_MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass) DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass) diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 9fb6de49fb2055..635c265a433631 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -217,6 +217,9 @@ HANDLE_TARGET_OPCODE(PATCHABLE_TYPED_EVENT_CALL) HANDLE_TARGET_OPCODE(ICALL_BRANCH_FUNNEL) +/// Represents a use of the operand but 
generates no code. +HANDLE_TARGET_OPCODE(FAKE_USE) + // This is a fence with the singlethread scope. It represents a compiler memory // barrier, but does not correspond to any generated instruction. HANDLE_TARGET_OPCODE(MEMBARRIER) diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 34332386085870..b2eb250ae60b60 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1418,6 +1418,16 @@ def FAULTING_OP : StandardPseudoInstruction { let isTerminator = true; let isBranch = true; } +def FAKE_USE : StandardPseudoInstruction { + // An instruction that uses its operands but does nothing; this instruction + // will be treated specially by CodeGen passes, distinguishing it from any + // otherwise equivalent instructions. + let OutOperandList = (outs); + let InOperandList = (ins variable_ops); + let AsmString = "FAKE_USE"; + let hasSideEffects = 0; + let isMeta = true; +} def PATCHABLE_OP : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins variable_ops); diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 128060ec912c76..f77b733c6c8f69 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -567,7 +567,8 @@ bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM, if (const IntrinsicInst *II = dyn_cast(BBI)) if (II->getIntrinsicID() == Intrinsic::lifetime_end || II->getIntrinsicID() == Intrinsic::assume || - II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) + II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl || + II->getIntrinsicID() == Intrinsic::fake_use) continue; if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() || !isSafeToSpeculativelyExecute(&*BBI)) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 60cb26973ead41..19d23c8ba96783 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ 
b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1131,6 +1131,21 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { AP.OutStreamer->addBlankLine(); } +static void emitFakeUse(const MachineInstr *MI, AsmPrinter &AP) { + std::string Str; + raw_string_ostream OS(Str); + OS << "fake_use:"; + for (const MachineOperand &Op : MI->operands()) { + // In some circumstances we can end up with fake uses of constants; skip + // these. + if (!Op.isReg()) + continue; + OS << ' ' << printReg(Op.getReg(), AP.MF->getSubtarget().getRegisterInfo()); + } + AP.OutStreamer->AddComment(OS.str()); + AP.OutStreamer->addBlankLine(); +} + /// emitDebugValueComment - This method handles the target-independent form /// of DBG_VALUE, returning true if it was able to do so. A false return /// means the target will need to handle MI in EmitInstruction. @@ -1799,6 +1814,10 @@ void AsmPrinter::emitFunctionBody() { case TargetOpcode::KILL: if (isVerbose()) emitKill(&MI, *this); break; + case TargetOpcode::FAKE_USE: + if (isVerbose()) + emitFakeUse(&MI, *this); + break; case TargetOpcode::PSEUDO_PROBE: emitPseudoProbe(MI); break; diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index f1607f85c5b319..ae12ce1170f703 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -200,6 +200,7 @@ add_llvm_component_library(LLVMCodeGen RegisterUsageInfo.cpp RegUsageInfoCollector.cpp RegUsageInfoPropagate.cpp + RemoveLoadsIntoFakeUses.cpp ReplaceWithVeclib.cpp ResetMachineFunctionPass.cpp RegisterBank.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 31fa4c105cef80..177702054a0e31 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -116,6 +116,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRegUsageInfoCollectorPass(Registry); initializeRegUsageInfoPropagationPass(Registry); initializeRegisterCoalescerPass(Registry); + 
initializeRemoveLoadsIntoFakeUsesPass(Registry); initializeRemoveRedundantDebugValuesPass(Registry); initializeRenameIndependentSubregsPass(Registry); initializeSafeStackLegacyPassPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index da6c758d53d487..271a047fc6a7b8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2800,12 +2800,34 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, return false; }; + SmallVector FakeUses; + + auto isFakeUse = [&FakeUses](const Instruction *Inst) { + if (auto *II = dyn_cast(Inst); + II && II->getIntrinsicID() == Intrinsic::fake_use) { + // Record the instruction so it can be preserved when the exit block is + // removed. Do not preserve the fake use that uses the result of the + // PHI instruction. + // Do not copy fake uses that use the result of a PHI node. + // FIXME: If we do want to copy the fake use into the return blocks, we + // have to figure out which of the PHI node operands to use for each + // copy. + if (!isa(II->getOperand(0))) { + FakeUses.push_back(II); + } + return true; + } + + return false; + }; + // Make sure there are no instructions between the first instruction // and return. const Instruction *BI = BB->getFirstNonPHI(); // Skip over debug and the bitcast. while (isa(BI) || BI == BCI || BI == EVI || - isa(BI) || isLifetimeEndOrBitCastFor(BI)) + isa(BI) || isLifetimeEndOrBitCastFor(BI) || + isFakeUse(BI)) BI = BI->getNextNode(); if (BI != RetI) return false; @@ -2814,6 +2836,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, /// call. const Function *F = BB->getParent(); SmallVector TailCallBBs; + // Record the call instructions so we can insert any fake uses + // that need to be preserved before them. + SmallVector CallInsts; if (PN) { for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { // Look through bitcasts. 
@@ -2825,6 +2850,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, TLI->mayBeEmittedAsTailCall(CI) && attributesPermitTailCall(F, CI, RetI, *TLI)) { TailCallBBs.push_back(PredBB); + CallInsts.push_back(CI); } else { // Consider the cases in which the phi value is indirectly produced by // the tail call, for example when encountering memset(), memmove(), @@ -2844,8 +2870,10 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && IncomingVal == CI->getArgOperand(0) && TLI->mayBeEmittedAsTailCall(CI) && - attributesPermitTailCall(F, CI, RetI, *TLI)) + attributesPermitTailCall(F, CI, RetI, *TLI)) { TailCallBBs.push_back(PredBB); + CallInsts.push_back(CI); + } } } } else { @@ -2863,6 +2891,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && V == CI->getArgOperand(0))) { TailCallBBs.push_back(Pred); + CallInsts.push_back(CI); } } } @@ -2889,8 +2918,17 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, } // If we eliminated all predecessors of the block, delete the block now. - if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) + if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) { + // Copy the fake uses found in the original return block to all blocks + // that contain tail calls. + for (auto *CI : CallInsts) { + for (auto const *FakeUse : FakeUses) { + auto *ClonedInst = FakeUse->clone(); + ClonedInst->insertBefore(CI); + } + } BB->eraseFromParent(); + } return Changed; } diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 7fc25cd889a0df..332ed37bd2b79f 100644 --- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -87,7 +87,8 @@ bool DeadMachineInstructionElimImpl::isDead(const MachineInstr *MI) const { return false; // Don't delete frame allocation labels. 
- if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE) + if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE || + MI->getOpcode() == TargetOpcode::FAKE_USE) return false; // Don't delete instructions with side effects. diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index f44af78cded46d..968d0a2a5c75e4 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2193,6 +2193,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, } return true; } + case Intrinsic::fake_use: { + SmallVector VRegs; + for (const auto &Arg : CI.args()) + for (auto VReg : getOrCreateVRegs(*Arg)) + VRegs.push_back(VReg); + MIRBuilder.buildInstr(TargetOpcode::FAKE_USE, std::nullopt, VRegs); + return true; + } case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast(CI); assert(DI.getVariable() && "Missing variable"); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index cfdd9905c16fa6..b1270e7aeb875c 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -228,6 +228,9 @@ bool llvm::isTriviallyDead(const MachineInstr &MI, // Don't delete frame allocation labels. if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) return false; + // Don't delete fake uses. + if (MI.getOpcode() == TargetOpcode::FAKE_USE) + return false; // LIFETIME markers should be preserved even if they seem dead. 
if (MI.getOpcode() == TargetOpcode::LIFETIME_START || MI.getOpcode() == TargetOpcode::LIFETIME_END) diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 27bbf5599b6046..aadc54b495fe22 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -406,7 +406,8 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, bool MachineCSE::isCSECandidate(MachineInstr *MI) { if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || - MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo()) + MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo() || + MI->isFakeUse()) return false; // Ignore copies. diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 7a3cf96ccffe0a..4e6d34346b1d80 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -530,7 +530,8 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB, MachineFunction *MF, const TargetInstrInfo *TII) { - return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF); + return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF) || + MI->isFakeUse(); } /// A region of an MBB for scheduling. 
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index fe515ef5be541f..609f9af9767f5d 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -833,7 +833,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (!ProcessedBegin) --I; - if (MI.isDebugOrPseudoInstr()) { + if (MI.isDebugOrPseudoInstr() || MI.isFakeUse()) { if (MI.isDebugValue()) ProcessDbgInst(MI); continue; diff --git a/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp new file mode 100644 index 00000000000000..232181a199b8c2 --- /dev/null +++ b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp @@ -0,0 +1,162 @@ +//===---- RemoveLoadsIntoFakeUses.cpp - Remove loads with no real uses ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The FAKE_USE instruction is used to preserve certain values through +/// optimizations for the sake of debugging. This may result in spilled values +/// being loaded into registers that are only used by FAKE_USEs; this is not +/// necessary for debugging purposes, because at that point the value must be on +/// the stack and hence available for debugging. Therefore, this pass removes +/// loads that are only used by FAKE_USEs. +/// +/// This pass should run very late, to ensure that we don't inadvertently +/// shorten stack lifetimes by removing these loads, since the FAKE_USEs will +/// also no longer be in effect. Running immediately before LiveDebugValues +/// ensures that LDV will have accurate information of the machine location of +/// debug values. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "remove-loads-into-fake-uses" + +STATISTIC(NumLoadsDeleted, "Number of dead load instructions deleted"); +STATISTIC(NumFakeUsesDeleted, "Number of FAKE_USE instructions deleted"); + +class RemoveLoadsIntoFakeUses : public MachineFunctionPass { +public: + static char ID; + + RemoveLoadsIntoFakeUses() : MachineFunctionPass(ID) { + initializeRemoveLoadsIntoFakeUsesPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "Remove Loads Into Fake Uses"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +char RemoveLoadsIntoFakeUses::ID = 0; +char &llvm::RemoveLoadsIntoFakeUsesID = RemoveLoadsIntoFakeUses::ID; + +INITIALIZE_PASS_BEGIN(RemoveLoadsIntoFakeUses, DEBUG_TYPE, + "Remove Loads Into Fake Uses", false, false) +INITIALIZE_PASS_END(RemoveLoadsIntoFakeUses, DEBUG_TYPE, + "Remove Loads Into Fake Uses", false, false) + +bool RemoveLoadsIntoFakeUses::runOnMachineFunction(MachineFunction &MF) { + // Only `optdebug` functions should contain FAKE_USEs, so don't try to run + // this for other functions. 
+ if (!MF.getFunction().hasFnAttribute(Attribute::OptimizeForDebugging) || + skipFunction(MF.getFunction())) + return false; + + bool AnyChanges = false; + + LiveRegUnits LivePhysRegs; + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + + SmallDenseMap> RegFakeUses; + LivePhysRegs.init(*TRI); + SmallVector Statepoints; + for (MachineBasicBlock *MBB : post_order(&MF)) { + LivePhysRegs.addLiveOuts(*MBB); + + for (MachineInstr &MI : make_early_inc_range(reverse(*MBB))) { + if (MI.isFakeUse()) { + for (const MachineOperand &MO : MI.operands()) { + // Track the Fake Uses that use this register so that we can delete + // them if we delete the corresponding load. + if (MO.isReg()) + RegFakeUses[MO.getReg()].push_back(&MI); + } + // Do not record FAKE_USE uses in LivePhysRegs so that we can recognize + // otherwise-unused loads. + continue; + } + + // If the restore size is not std::nullopt then we are dealing with a + // reload of a spilled register. + if (MI.getRestoreSize(TII)) { + Register Reg = MI.getOperand(0).getReg(); + assert(Reg.isPhysical() && "VReg seen in function with NoVRegs set?"); + // Don't delete live physreg defs, or any reserved register defs. + if (!LivePhysRegs.available(Reg) || MRI->isReserved(Reg)) + continue; + // There should be an exact match between the loaded register and the + // FAKE_USE use. If not, this is a load that is unused by anything? It + // should probably be deleted, but that's outside of this pass' scope. + if (RegFakeUses.contains(Reg)) { + LLVM_DEBUG(dbgs() << "RemoveLoadsIntoFakeUses: DELETING: " << MI); + // It is possible that some DBG_VALUE instructions refer to this + // instruction. They will be deleted in the live debug variable + // analysis. 
+ MI.eraseFromParent(); + AnyChanges = true; + ++NumLoadsDeleted; + // Each FAKE_USE now appears to be a fake use of the previous value + // of the loaded register; delete them to avoid incorrectly + // interpreting them as such. + for (MachineInstr *FakeUse : RegFakeUses[Reg]) { + LLVM_DEBUG(dbgs() + << "RemoveLoadsIntoFakeUses: DELETING: " << *FakeUse); + FakeUse->eraseFromParent(); + } + NumFakeUsesDeleted += RegFakeUses[Reg].size(); + RegFakeUses[Reg].clear(); + } + continue; + } + + // In addition to tracking LivePhysRegs, we need to clear RegFakeUses each + // time a register is defined, as existing FAKE_USEs no longer apply to + // that register. + if (!RegFakeUses.empty()) { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isDef()) { + Register Reg = MO.getReg(); + assert(Reg.isPhysical() && + "VReg seen in function with NoVRegs set?"); + for (MCRegUnit Unit : TRI->regunits(Reg)) + RegFakeUses.erase(Unit); + } + } + } + LivePhysRegs.stepBackward(MI); + } + } + + return AnyChanges; +} diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 067f82c99adca1..162af2d9d708a9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1464,6 +1464,9 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { updateValueMap(II, ResultReg); return true; } + case Intrinsic::fake_use: + // At -O0, we don't need fake use, so just ignore it. 
+ return true; case Intrinsic::experimental_stackmap: return selectStackmap(II); case Intrinsic::experimental_patchpoint_void: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 221dcfe145594f..b5c80005a0ecc1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2438,6 +2438,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { report_fatal_error("Do not know how to promote this operator's operand!"); case ISD::BITCAST: R = PromoteFloatOp_BITCAST(N, OpNo); break; + case ISD::FAKE_USE: + R = PromoteFloatOp_FAKE_USE(N, OpNo); + break; case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: @@ -2480,6 +2483,13 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo) { return DAG.getBitcast(N->getValueType(0), Convert); } +SDValue DAGTypeLegalizer::PromoteFloatOp_FAKE_USE(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Only Operand 1 must need promotion here"); + SDValue Op = GetPromotedFloat(N->getOperand(OpNo)); + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::Other, N->getOperand(0), + Op); +} + // Promote Operand 1 of FCOPYSIGN. Operand 0 ought to be handled by // PromoteFloatRes_FCOPYSIGN. 
SDValue DAGTypeLegalizer::PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo) { @@ -3433,6 +3443,9 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { "operand!"); case ISD::BITCAST: Res = SoftPromoteHalfOp_BITCAST(N); break; + case ISD::FAKE_USE: + Res = SoftPromoteHalfOp_FAKE_USE(N, OpNo); + break; case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break; @@ -3473,6 +3486,13 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0); } +SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FAKE_USE(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Only Operand 1 must need promotion here"); + SDValue Op = GetSoftPromotedHalf(N->getOperand(OpNo)); + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::Other, N->getOperand(0), + Op); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo) { assert(OpNo == 1 && "Only Operand 1 must need promotion here"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index c19a5a4995627a..05971152d535c6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1934,6 +1934,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; + case ISD::FAKE_USE: + Res = PromoteIntOp_FAKE_USE(N); + break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo); break; @@ -5280,6 +5283,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break; case 
ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; + case ISD::FAKE_USE: + Res = ExpandOp_FAKE_USE(N); + break; case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break; case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::EXPERIMENTAL_VP_SPLAT: @@ -6115,6 +6121,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_SUBVECTOR(SDNode *N) { return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0)); } +// FIXME: We wouldn't need this if clang could promote short integers +// that are arguments to FAKE_USE. +SDValue DAGTypeLegalizer::PromoteIntOp_FAKE_USE(SDNode *N) { + SDLoc dl(N); + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + EVT InVT1 = V1.getValueType(); + SDValue VPromoted = + DAG.getNode(ISD::ANY_EXTEND, dl, + TLI.getTypeToTransformTo(*DAG.getContext(), InVT1), V1); + return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), V0, VPromoted); +} + SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 1088db4bdbe0b3..4577346a02d605 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -391,6 +391,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_INSERT_SUBVECTOR(SDNode *N); + SDValue PromoteIntOp_FAKE_USE(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_ScalarOp(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); @@ -755,6 +756,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { bool PromoteFloatOperand(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo); + SDValue 
PromoteFloatOp_FAKE_USE(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo); @@ -800,6 +802,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_BITCAST(SDNode *N); + SDValue SoftPromoteHalfOp_FAKE_USE(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_FP_EXTEND(SDNode *N); SDValue SoftPromoteHalfOp_FP_TO_XINT(SDNode *N); @@ -877,6 +880,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecOp_VECREDUCE(SDNode *N); SDValue ScalarizeVecOp_VECREDUCE_SEQ(SDNode *N); SDValue ScalarizeVecOp_CMP(SDNode *N); + SDValue ScalarizeVecOp_FAKE_USE(SDNode *N); //===--------------------------------------------------------------------===// // Vector Splitting Support: LegalizeVectorTypes.cpp @@ -964,6 +968,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); + SDValue SplitVecOp_FAKE_USE(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo); @@ -1069,6 +1074,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_EXTEND_VECTOR_INREG(SDNode *N); + SDValue WidenVecOp_FAKE_USE(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo); SDValue WidenVecOp_VP_STRIDED_STORE(SDNode *N, unsigned OpNo); @@ -1198,6 +1204,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ExpandOp_BITCAST (SDNode *N); SDValue 
ExpandOp_BUILD_VECTOR (SDNode *N); SDValue ExpandOp_EXTRACT_ELEMENT (SDNode *N); + SDValue ExpandOp_FAKE_USE(SDNode *N); SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N); SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N); SDValue ExpandOp_NormalStore (SDNode *N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index a55364ea2c4e5b..b402e823762764 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -403,6 +403,17 @@ SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) { return N->getConstantOperandVal(1) ? Hi : Lo; } +// Split the integer operand in two and create a second FAKE_USE node for +// the other half. The original SDNode is updated in place. +SDValue DAGTypeLegalizer::ExpandOp_FAKE_USE(SDNode *N) { + SDValue Lo, Hi; + SDValue Chain = N->getOperand(0); + GetExpandedOp(N->getOperand(1), Lo, Hi); + SDValue LoUse = DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, Chain, Lo); + DAG.UpdateNodeOperands(N, LoUse, Hi); + return SDValue(N, 0); +} + SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) { // The vector type is legal but the element type needs expansion. 
EVT VecVT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 475d5806467d98..4c6da7c5df6b40 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -746,6 +746,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::BITCAST: Res = ScalarizeVecOp_BITCAST(N); break; + case ISD::FAKE_USE: + Res = ScalarizeVecOp_FAKE_USE(N); + break; case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: @@ -846,6 +849,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) { N->getValueType(0), Elt); } +// Need to legalize vector operands of fake uses. Must be <1 x ty>. +SDValue DAGTypeLegalizer::ScalarizeVecOp_FAKE_USE(SDNode *N) { + assert(N->getOperand(1).getValueType().getVectorNumElements() == 1 && + "Fake Use: Unexpected vector type!"); + SDValue Elt = GetScalarizedVector(N->getOperand(1)); + return DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, N->getOperand(0), Elt); +} + /// If the input is a vector that needs to be scalarized, it must be <1 x ty>. /// Do the operation on the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { @@ -3291,6 +3302,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { Res = SplitVecOp_CMP(N); break; + case ISD::FAKE_USE: + Res = SplitVecOp_FAKE_USE(N); + break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: @@ -3505,6 +3519,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } +// Split a FAKE_USE use of a vector into FAKE_USEs of hi and lo part. 
+SDValue DAGTypeLegalizer::SplitVecOp_FAKE_USE(SDNode *N) { + SDValue Lo, Hi; + GetSplitVector(N->getOperand(1), Lo, Hi); + SDValue Chain = + DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, N->getOperand(0), Lo); + return DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, Chain, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) { // For example, i64 = BITCAST v4i16 on alpha. Typically the vector will // end up being split all the way down to individual components. Convert the @@ -6466,6 +6489,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { report_fatal_error("Do not know how to widen this operator's operand!"); case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break; + case ISD::FAKE_USE: + Res = WidenVecOp_FAKE_USE(N); + break; case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break; case ISD::INSERT_SUBVECTOR: Res = WidenVecOp_INSERT_SUBVECTOR(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; @@ -6851,6 +6877,16 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { return CreateStackStoreLoad(InOp, VT); } +// Vectors with sizes that are not powers of 2 need to be widened to the +// next largest power of 2. For example, we may get a vector of 3 32-bit +// integers or of 6 16-bit integers, both of which have to be widened to a +// 128-bit vector. 
+SDValue DAGTypeLegalizer::WidenVecOp_FAKE_USE(SDNode *N) { + SDValue WidenedOp = GetWidenedVector(N->getOperand(1)); + return DAG.getNode(ISD::FAKE_USE, SDLoc(), MVT::Other, N->getOperand(0), + WidenedOp); +} + SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ad24704d940a36..521a4fee8aafe0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1622,6 +1622,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef Values, SDValue N = NodeMap[V]; if (!N.getNode() && isa(V)) // Check unused arguments map. N = UnusedArgNodeMap[V]; + if (N.getNode()) { // Only emit func arg dbg value for non-variadic dbg.values for now. if (!IsVariadic && @@ -7703,6 +7704,38 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } + case Intrinsic::fake_use: { + Value *V = I.getArgOperand(0); + SDValue Ops[2]; + // For Values not declared or previously used in this basic block, the + // NodeMap will not have an entry, and `getValue` will assert if V has no + // valid register value. + auto FakeUseValue = [&]() -> SDValue { + SDValue &N = NodeMap[V]; + if (N.getNode()) + return N; + + // If there's a virtual register allocated and initialized for this + // value, use it. + if (SDValue copyFromReg = getCopyFromRegs(V, V->getType())) + return copyFromReg; + // FIXME: Do we want to preserve constants? It seems pointless. + if (isa(V)) + return getValue(V); + return SDValue(); + }(); + if (!FakeUseValue || FakeUseValue.isUndef()) + return; + Ops[0] = getRoot(); + Ops[1] = FakeUseValue; + // Also, do not translate a fake use with an undef operand, or any other + // empty SDValues. 
+ if (!Ops[1] || Ops[1].isUndef()) + return; + DAG.setRoot(DAG.getNode(ISD::FAKE_USE, sdl, MVT::Other, Ops)); + return; + } + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { // Get the exception pointer vreg, copy from it, and resize it to fit. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 001f782f209fdb..a253d1a0e20170 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -454,6 +454,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UBSANTRAP: return "ubsantrap"; case ISD::LIFETIME_START: return "lifetime.start"; case ISD::LIFETIME_END: return "lifetime.end"; + case ISD::FAKE_USE: + return "fake_use"; case ISD::PSEUDO_PROBE: return "pseudoprobe"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 09bde54b9aaa5d..8e268d4f4968ea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -804,6 +804,50 @@ static void reportFastISelFailure(MachineFunction &MF, LLVM_DEBUG(dbgs() << R.getMsg() << "\n"); } +// Detect any fake uses that follow a tail call and move them before the tail +// call. Ignore fake uses that use values that are def'd by or after the tail +// call. +static void preserveFakeUses(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + BasicBlock::iterator I = End; + if (--I == Begin || !isa(*I)) + return; + // Detect whether there are any fake uses trailing a (potential) tail call. 
+ bool HaveFakeUse = false; + bool HaveTailCall = false; + do { + if (const CallInst *CI = dyn_cast(--I)) + if (CI->isTailCall()) { + HaveTailCall = true; + break; + } + if (const IntrinsicInst *II = dyn_cast(I)) + if (II->getIntrinsicID() == Intrinsic::fake_use) + HaveFakeUse = true; + } while (I != Begin); + + // If we didn't find any tail calls followed by fake uses, we are done. + if (!HaveTailCall || !HaveFakeUse) + return; + + SmallVector FakeUses; + // Record the fake uses we found so we can move them to the front of the + // tail call. Ignore them if they use a value that is def'd by or after + // the tail call. + for (BasicBlock::iterator Inst = I; Inst != End; Inst++) { + if (IntrinsicInst *FakeUse = dyn_cast(Inst); + FakeUse && FakeUse->getIntrinsicID() == Intrinsic::fake_use) { + if (auto UsedDef = dyn_cast(FakeUse->getOperand(0)); + !UsedDef || UsedDef->getParent() != I->getParent() || + UsedDef->comesBefore(&*I)) + FakeUses.push_back(FakeUse); + } + } + + for (auto *Inst : FakeUses) + Inst->moveBefore(*Inst->getParent(), I); +} + void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, BasicBlock::const_iterator End, bool &HadTailCall) { @@ -1665,6 +1709,16 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->VisitedBBs[LLVMBB->getNumber()] = true; } + // Fake uses that follow tail calls are dropped. To avoid this, move + // such fake uses in front of the tail call, provided they don't + // use anything def'd by or after the tail call. 
+ { + BasicBlock::iterator BBStart = + const_cast(LLVMBB)->getFirstNonPHI()->getIterator(); + BasicBlock::iterator BBEnd = const_cast(LLVMBB)->end(); + preserveFakeUses(BBStart, BBEnd); + } + BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI()->getIterator(); BasicBlock::const_iterator const End = LLVMBB->end(); @@ -2448,6 +2502,13 @@ void SelectionDAGISel::Select_UNDEF(SDNode *N) { CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); } +// Use the generic target FAKE_USE target opcode. The chain operand +// must come last, because InstrEmitter::AddOperand() requires it. +void SelectionDAGISel::Select_FAKE_USE(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::FAKE_USE, N->getValueType(0), + N->getOperand(1), N->getOperand(0)); +} + void SelectionDAGISel::Select_FREEZE(SDNode *N) { // TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now. // If FREEZE instruction is added later, the code below must be changed as @@ -3219,6 +3280,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::UNDEF: Select_UNDEF(NodeToMatch); return; + case ISD::FAKE_USE: + Select_FAKE_USE(NodeToMatch); + return; case ISD::FREEZE: Select_FREEZE(NodeToMatch); return; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 1d52ebe6717f04..c0b834650d73b0 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1206,6 +1206,7 @@ void TargetPassConfig::addMachinePasses() { // addPreEmitPass. Maybe only pass "false" here for those targets? 
addPass(&FuncletLayoutID); + addPass(&RemoveLoadsIntoFakeUsesID); addPass(&StackMapLivenessID); addPass(&LiveDebugValuesID); addPass(&MachineSanitizerBinaryMetadataID); diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 6f0f3f244c050c..62d88ce21657b2 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1171,7 +1171,10 @@ Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const { const Instruction * Instruction::getPrevNonDebugInstruction(bool SkipPseudoOp) const { for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode()) - if (!isa(I) && !(SkipPseudoOp && isa(I))) + if (!isa(I) && + !(SkipPseudoOp && isa(I)) && + !(isa(I) && + cast(I)->getIntrinsicID() == Intrinsic::fake_use)) return I; return nullptr; } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 2c0f10a34f919d..79b3ca3b6a5a7e 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5128,6 +5128,7 @@ void Verifier::visitInstruction(Instruction &I) { F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void || F->getIntrinsicID() == Intrinsic::experimental_patchpoint || + F->getIntrinsicID() == Intrinsic::fake_use || F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint || F->getIntrinsicID() == Intrinsic::wasm_rethrow || IsAttachedCallOperand(F, CBI, i), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 097e29527eed9f..e86d3771bd2f26 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -315,6 +315,7 @@ void NVPTXPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); disablePass(&ShrinkWrapID); + disablePass(&RemoveLoadsIntoFakeUsesID); addPass(createNVPTXAAWrapperPass()); addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp 
b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 48a2ce89bad390..7058b15d53aa0b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -140,6 +140,7 @@ void SPIRVPassConfig::addPostRegAlloc() { disablePass(&ShrinkWrapID); disablePass(&LiveDebugValuesID); disablePass(&MachineLateInstrsCleanupID); + disablePass(&RemoveLoadsIntoFakeUsesID); // Do not work with OpPhi. disablePass(&BranchFolderPassID); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 23539a5f4b26f1..73765f8fa0092c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -552,6 +552,7 @@ void WebAssemblyPassConfig::addPostRegAlloc() { disablePass(&StackMapLivenessID); disablePass(&PatchableFunctionID); disablePass(&ShrinkWrapID); + disablePass(&RemoveLoadsIntoFakeUsesID); // This pass hurts code size for wasm because it can generate irreducible // control flow. diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 02c3ca9839fc2d..ea94a4be32b2fa 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -432,6 +432,24 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { if (MI.isCall()) FPInstClass = X86II::SpecialFP; + // A fake_use with a floating point pseudo register argument that is + // killed must behave like any other floating point operation and pop + // the floating point stack (this is done in handleSpecialFP()). + // Fake_use is, however, unusual, in that sometimes its operand is not + // killed because a later instruction (probably a return) will use it. + // It is this instruction that will pop the stack. + // In this scenario we can safely remove the fake_use's operand + // (it is live anyway). 
+ if (MI.isFakeUse()) { + const MachineOperand &MO = MI.getOperand(0); + if (MO.isReg() && X86::RFP80RegClass.contains(MO.getReg())) { + if (MO.isKill()) + FPInstClass = X86II::SpecialFP; + else + MI.removeOperand(0); + } + } + if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! @@ -1737,6 +1755,20 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { // Don't delete the inline asm! return; } + + // FAKE_USE must pop its register operand off the stack if it is killed, + // because this constitutes the register's last use. If the operand + // is not killed, it will have its last use later, so we leave it alone. + // In either case we remove the operand so later passes don't see it. + case TargetOpcode::FAKE_USE: { + assert(MI.getNumExplicitOperands() == 1 && + "FAKE_USE must have exactly one operand"); + if (MI.getOperand(0).isKill()) { + freeStackSlotBefore(Inst, getFPReg(MI.getOperand(0))); + } + MI.removeOperand(0); + return; + } } Inst = MBB->erase(Inst); // Remove the pseudo instruction diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 26b62cb79cdedf..2310cb3a7decbb 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3802,6 +3802,12 @@ class AggLoadStoreRewriter : public InstVisitor { struct LoadOpSplitter : public OpSplitter { AAMDNodes AATags; + // A vector to hold the split components that we want to emit + // separate fake uses for. + SmallVector Components; + // A vector to hold all the fake uses of the struct that we are splitting. + // Usually there should only be one, but we are handling the general case. 
+ SmallVector FakeUses; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, @@ -3826,10 +3832,32 @@ class AggLoadStoreRewriter : public InstVisitor { GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) Load->setAAMetadata( AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL)); + // Record the load so we can generate a fake use for this aggregate + // component. + Components.push_back(Load); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); } + + // Stash the fake uses that use the value generated by this instruction. + void recordFakeUses(LoadInst &LI) { + for (Use &U : LI.uses()) + if (auto *II = dyn_cast(U.getUser())) + if (II->getIntrinsicID() == Intrinsic::fake_use) + FakeUses.push_back(II); + } + + // Replace all fake uses of the aggregate with a series of fake uses, one + // for each split component. + void emitFakeUses() { + for (Instruction *I : FakeUses) { + IRB.SetInsertPoint(I); + for (auto *V : Components) + IRB.CreateIntrinsic(Intrinsic::fake_use, {}, {V}); + I->eraseFromParent(); + } + } }; bool visitLoadInst(LoadInst &LI) { @@ -3841,8 +3869,10 @@ class AggLoadStoreRewriter : public InstVisitor { LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), getAdjustedAlignment(&LI, 0), DL, IRB); + Splitter.recordFakeUses(LI); Value *V = PoisonValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); + Splitter.emitFakeUses(); Visited.erase(&LI); LI.replaceAllUsesWith(V); LI.eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 47e3c03288d979..dc9ca1423f3e79 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -513,6 +513,12 @@ void PruningFunctionCloner::CloneBlock( 
for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { + // Don't clone fake_use as it may suppress many optimizations + // due to inlining, especially SROA. + if (auto *IntrInst = dyn_cast(II)) + if (IntrInst->getIntrinsicID() == Intrinsic::fake_use) + continue; + Instruction *NewInst = cloneInstruction(II); NewInst->insertInto(NewBB, NewBB->end()); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index e4809cd4bb44db..d0669e44f821b3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3491,6 +3491,9 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To, unsigned Count = 0; for (Use &U : llvm::make_early_inc_range(From->uses())) { + auto *II = dyn_cast(U.getUser()); + if (II && II->getIntrinsicID() == Intrinsic::fake_use) + continue; if (!ShouldReplace(Root, U)) continue; LLVM_DEBUG(dbgs() << "Replace dominated use of '"; diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 5251eb86bca926..1b7912fdf5e304 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -80,7 +80,8 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { if (SI->isVolatile()) return false; } else if (const IntrinsicInst *II = dyn_cast(U)) { - if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable() && + II->getIntrinsicID() != Intrinsic::fake_use) return false; } else if (const BitCastInst *BCI = dyn_cast(U)) { if (!onlyUsedByLifetimeMarkersOrDroppableInsts(BCI)) diff --git a/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll b/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll index 2370fe1468b4ed..5d304d1569d3f6 100644 --- a/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll +++ b/llvm/test/Analysis/ScalarEvolution/flags-from-poison-dbg.ll 
@@ -16,7 +16,7 @@ for.body.lr.ph: ; preds = %entry for.body: ; preds = %for.inc, %for.body.lr.ph %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] %add = add nsw i32 %i.02, 50, !dbg !16 - call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !18, metadata !19), !dbg !20 + tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !18, metadata !19), !dbg !20 %idxprom = sext i32 %add to i64, !dbg !21 ; CHECK: %idxprom = sext i32 %add to i64 diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index ba611493e1a76e..ec6f24a5650fa3 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -69,6 +69,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: Workaround A53 erratum 835769 pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 3465b717261cf5..ffbe3dd377109f 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -224,6 +224,7 @@ ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Workaround A53 erratum 835769 pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 29e8ebdafb5871..7bf1b8746fd87b 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -143,6 +143,7 @@ ; GCN-O0-NEXT: Post RA hazard recognizer ; GCN-O0-NEXT: Branch relaxation pass ; GCN-O0-NEXT: Register Usage Information Collector Pass 
+; GCN-O0-NEXT: Remove Loads Into Fake Uses ; GCN-O0-NEXT: Live DEBUG_VALUE analysis ; GCN-O0-NEXT: Machine Sanitizer Binary Metadata ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis @@ -420,6 +421,7 @@ ; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass ; GCN-O1-NEXT: Register Usage Information Collector Pass +; GCN-O1-NEXT: Remove Loads Into Fake Uses ; GCN-O1-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-NEXT: Machine Sanitizer Binary Metadata ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis @@ -725,6 +727,7 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass +; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-OPTS-NEXT: Machine Sanitizer Binary Metadata ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis @@ -1036,6 +1039,7 @@ ; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass ; GCN-O2-NEXT: Register Usage Information Collector Pass +; GCN-O2-NEXT: Remove Loads Into Fake Uses ; GCN-O2-NEXT: Live DEBUG_VALUE analysis ; GCN-O2-NEXT: Machine Sanitizer Binary Metadata ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis @@ -1359,6 +1363,7 @@ ; GCN-O3-NEXT: AMDGPU Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass ; GCN-O3-NEXT: Register Usage Information Collector Pass +; GCN-O3-NEXT: Remove Loads Into Fake Uses ; GCN-O3-NEXT: Live DEBUG_VALUE analysis ; GCN-O3-NEXT: Machine Sanitizer Binary Metadata ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 9b983d96f79330..819623d3fcc5a3 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -193,6 +193,7 @@ ; CHECK-NEXT: ARM block placement ; CHECK-NEXT: optimise barriers pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; 
CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll index 38c1dbcb1075fa..24bd4c75a9821e 100644 --- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index 50f154663177d0..53cdbd18f9b907 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -166,6 +166,7 @@ ; LAXX-NEXT: Implement the 'patchable-function' attribute ; LAXX-NEXT: Branch relaxation pass ; LAXX-NEXT: Contiguously Lay Out Funclets +; LAXX-NEXT: Remove Loads Into Fake Uses ; LAXX-NEXT: StackMap Liveness Analysis ; LAXX-NEXT: Live DEBUG_VALUE analysis ; LAXX-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir new file mode 100644 index 00000000000000..7eb8915f26a80f --- /dev/null +++ b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir @@ -0,0 +1,99 @@ +# In certain cases CodeGenPrepare folds a return instruction into +# the return block's predecessor blocks and subsequently deletes the return block. +# The purpose of this is to enable tail call optimization in the predecessor blocks. +# Removal of the return block also removes fake use instructions that were present +# in the return block, potentially causing debug information to be lost. 
+# +# The fix is to clone any fake use instructions that are not dominated by definitions +# in the return block itself into the predecessor blocks. This test ensures that we do so. +# +# Generated from the following source with +# clang -fextend-lifetimes -S -emit-llvm -O2 -mllvm -stop-before=codegenprepare -o test.mir test.c +# +# extern int f0(); +# extern int f1(); +# +# int foo(int i) { +# int temp = i; +# if (temp == 0) +# temp = f0(); +# else +# temp = f1(); +# return temp; +# } +# +# RUN: llc -run-pass=codegenprepare -o - %s | FileCheck %s +# +# CHECK: define{{.*}}foo +# CHECK: if.then: +# CHECK-NEXT: call{{.*}}fake.use(i32 %i) +# CHECK-NEXT: tail call i32{{.*}}@f0 +# CHECK-NEXT: ret +# CHECK: if.else: +# CHECK-NEXT: call{{.*}}fake.use(i32 %i) +# CHECK-NEXT: tail call i32{{.*}}@f1 +# CHECK-NEXT: ret + +--- | + define hidden i32 @foo(i32 %i) local_unnamed_addr optdebug { + entry: + %cmp = icmp eq i32 %i, 0 + br i1 %cmp, label %if.then, label %if.else + + if.then: + %call = tail call i32 (...) @f0() + br label %if.end + + if.else: + %call1 = tail call i32 (...) @f1() + br label %if.end + + if.end: + %temp.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ] + notail call void (...) @llvm.fake.use(i32 %temp.0) + notail call void (...) @llvm.fake.use(i32 %i) + ret i32 %temp.0 + } + declare i32 @f0(...) local_unnamed_addr + declare i32 @f1(...) local_unnamed_addr + +...
+--- +name: foo +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + +... diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll index 4a17384e499936..5853647bf3b9f3 100644 --- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll @@ -60,6 +60,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: PowerPC Pre-Emit Peephole ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index 39b23a57513d9d..21bd4bb8502c3d 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -214,6 +214,7 @@ ; CHECK-NEXT: PowerPC Pre-Emit Peephole ; CHECK-NEXT: PowerPC Early-Return Creation ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index 7473809a2c5d2e..84c7f3f987c065 
100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -63,6 +63,7 @@ ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: RISC-V Make Compressible ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 44c270fdc3c257..5d14d14d216244 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -189,6 +189,7 @@ ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: RISC-V Make Compressible ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 98b86384b8443d..4c99dd830b442e 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -71,6 +71,7 @@ ; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/CodeGen/X86/fake-use-hpfloat.ll b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll new file mode 100644 index 00000000000000..7a95c38801837c --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll @@ -0,0 +1,15 @@ +; assert in DAGlegalizer with fake use of half precision float. +; Changes to half float promotion. 
+; RUN: llc -stop-after=finalize-isel -o - %s | FileCheck %s +; +; CHECK: bb.0.entry: +; CHECK-NEXT: %0:fr16 = FsFLD0SH +; CHECK-NEXT: FAKE_USE killed %0 +; +target triple = "x86_64-unknown-unknown" + +define void @_Z6doTestv() local_unnamed_addr optdebug { +entry: + tail call void (...) @llvm.fake.use(half 0xH0000) + ret void +} diff --git a/llvm/test/CodeGen/X86/fake-use-ld.ll b/llvm/test/CodeGen/X86/fake-use-ld.ll new file mode 100644 index 00000000000000..86e7235091dd1c --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-ld.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +; Checks that fake uses of the FP stack do not cause a crash. +; +; /*******************************************************************/ +; extern long double foo(long double, long double, long double); +; +; long double actual(long double p1, long double p2, long double p3) { +; return fmal(p1, p2, p3); +; } +; /*******************************************************************/ + +define x86_fp80 @actual(x86_fp80 %p1, x86_fp80 %p2, x86_fp80 %p3) optdebug { +; +; CHECK: actual +; +entry: + %p1.addr = alloca x86_fp80, align 16 + %p2.addr = alloca x86_fp80, align 16 + %p3.addr = alloca x86_fp80, align 16 + store x86_fp80 %p1, ptr %p1.addr, align 16 + store x86_fp80 %p2, ptr %p2.addr, align 16 + store x86_fp80 %p3, ptr %p3.addr, align 16 + %0 = load x86_fp80, ptr %p1.addr, align 16 + %1 = load x86_fp80, ptr %p2.addr, align 16 + %2 = load x86_fp80, ptr %p3.addr, align 16 +; +; CHECK: callq{{.*}}foo +; + %3 = call x86_fp80 @foo(x86_fp80 %0, x86_fp80 %1, x86_fp80 %2) + %4 = load x86_fp80, ptr %p1.addr, align 16 + call void (...) @llvm.fake.use(x86_fp80 %4) + %5 = load x86_fp80, ptr %p2.addr, align 16 + call void (...) @llvm.fake.use(x86_fp80 %5) + %6 = load x86_fp80, ptr %p3.addr, align 16 + call void (...) 
@llvm.fake.use(x86_fp80 %6) +; +; CHECK: ret +; + ret x86_fp80 %3 +} + +declare x86_fp80 @foo(x86_fp80, x86_fp80, x86_fp80) diff --git a/llvm/test/CodeGen/X86/fake-use-scheduler.mir b/llvm/test/CodeGen/X86/fake-use-scheduler.mir new file mode 100644 index 00000000000000..7e55f1d79aa7b6 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-scheduler.mir @@ -0,0 +1,123 @@ +# Prevent the machine scheduler from moving instructions past FAKE_USE. +# RUN: llc -run-pass machine-scheduler -debug-only=machine-scheduler 2>&1 -o - %s | FileCheck %s +# REQUIRES: asserts +# +# We make sure that, beginning with the first FAKE_USE instruction, +# no changes to the sequence of instructions are undertaken by the +# scheduler. We don't bother to check that the order of the FAKE_USEs +# remains the same. They should, but it is irrelevant. +# +# CHECK: ********** MI Scheduling ********** +# CHECK-NEXT: foo:%bb.0 entry +# CHECK-NEXT: From: %0:gr64 = COPY $rdi +# CHECK-NEXT: To: FAKE_USE %5:gr64 +# CHECK-NEXT: RegionInstrs: 7 +# +# CHECK: ********** MI Scheduling ********** +# CHECK-NEXT: bar:%bb.0 entry +# CHECK-NEXT: From: %0:gr64 = COPY $rdi +# CHECK-NEXT: To: RET 0, killed $rax +# CHECK-NEXT: RegionInstrs: 7 +# +--- | + ; ModuleID = 'test.ll' + source_filename = "test.ll" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + + @glb = common dso_local local_unnamed_addr global [100 x i32] zeroinitializer, align 16 + + define dso_local i64 @foo(ptr %p) local_unnamed_addr optdebug { + entry: + %0 = load i32, ptr @glb, align 16 + store i32 %0, ptr %p, align 4 + %conv = sext i32 %0 to i64 + %1 = load i32, ptr getelementptr inbounds ([100 x i32], ptr @glb, i64 0, i64 1), align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %p, i64 1 + store i32 %1, ptr %arrayidx1, align 4 + %conv2 = sext i32 %1 to i64 + %add3 = add nsw i64 %conv2, %conv + notail call void (...) @llvm.fake.use(i64 %add3) + notail call void (...) 
@llvm.fake.use(i32 %1) + notail call void (...) @llvm.fake.use(i32 %0) + notail call void (...) @llvm.fake.use(ptr %p) + ret i64 %add3 + } + + define dso_local i64 @bar(ptr %p) local_unnamed_addr optdebug { + entry: + %0 = load i32, ptr @glb, align 16 + store i32 %0, ptr %p, align 4 + %conv = sext i32 %0 to i64 + %1 = load i32, ptr getelementptr inbounds ([100 x i32], ptr @glb, i64 0, i64 1), align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %p, i64 1 + store i32 %1, ptr %arrayidx1, align 4 + %conv2 = sext i32 %1 to i64 + %add3 = add nsw i64 %conv2, %conv + ret i64 %add3 + } + + ; Function Attrs: nocallback nofree nosync nounwind willreturn + declare void @llvm.stackprotector(ptr, ptr) + +... +--- +name: foo +alignment: 16 +tracksRegLiveness: true +debugInstrRef: true +registers: + - { id: 0, class: gr64, preferred-register: '' } + - { id: 1, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 4, class: gr32, preferred-register: '' } + - { id: 5, class: gr64, preferred-register: '' } +liveins: + - { reg: '$rdi', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $rdi + + %0:gr64 = COPY $rdi + %1:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb, $noreg + MOV32mr %0, 1, $noreg, 0, $noreg, %1.sub_32bit + %3:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb + 4, $noreg + MOV32mr %0, 1, $noreg, 4, $noreg, %3.sub_32bit + %5:gr64 = COPY %3 + %5:gr64 = nsw ADD64rr %5, %1, implicit-def dead $eflags + FAKE_USE %5 + FAKE_USE %3.sub_32bit + FAKE_USE %1.sub_32bit + FAKE_USE %0 + $rax = COPY %5 + RET 0, killed $rax + +... 
+--- +name: bar +alignment: 16 +tracksRegLiveness: true +debugInstrRef: true +registers: + - { id: 0, class: gr64, preferred-register: '' } + - { id: 1, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr64_with_sub_8bit, preferred-register: '' } + - { id: 4, class: gr32, preferred-register: '' } + - { id: 5, class: gr64_with_sub_8bit, preferred-register: '' } +liveins: + - { reg: '$rdi', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $rdi + + %0:gr64 = COPY $rdi + %1:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb, $noreg + MOV32mr %0, 1, $noreg, 0, $noreg, %1.sub_32bit + %5:gr64_with_sub_8bit = MOVSX64rm32 $rip, 1, $noreg, @glb + 4, $noreg + MOV32mr %0, 1, $noreg, 4, $noreg, %5.sub_32bit + %5:gr64_with_sub_8bit = nsw ADD64rr %5, %1, implicit-def dead $eflags + $rax = COPY %5 + RET 0, killed $rax + +... diff --git a/llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll b/llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll new file mode 100644 index 00000000000000..45a210ef391009 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-simple-tail-call.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -o - \ +; RUN: | FileCheck %s --implicit-check-not=TAILCALL +; Generated with: clang -emit-llvm -O2 -S -fextend-lifetimes test.cpp -o - +; =========== test.cpp =============== +; extern int bar(int); +; int foo1(int i) +; { +; return bar(i); +; } +; =========== test.cpp =============== + +; CHECK: TAILCALL + +; ModuleID = 'test.cpp' +source_filename = "test.cpp" + +define i32 @_Z4foo1i(i32 %i) local_unnamed_addr optdebug { +entry: + %call = tail call i32 @_Z3bari(i32 %i) + tail call void (...) 
@llvm.fake.use(i32 %i) + ret i32 %call +} + +declare i32 @_Z3bari(i32) local_unnamed_addr diff --git a/llvm/test/CodeGen/X86/fake-use-suppress-load.ll b/llvm/test/CodeGen/X86/fake-use-suppress-load.ll new file mode 100644 index 00000000000000..c1b442ebd79ffa --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-suppress-load.ll @@ -0,0 +1,14 @@ +; Suppress redundant loads feeding into fake uses. +; RUN: llc -filetype=asm -o - %s --mtriple=x86_64-unknown-unknown | FileCheck %s +; Windows ABI works differently, there's no offset. +; +; Look for the spill +; CHECK: movq %r{{[a-z]+,}} -{{[0-9]+\(%rsp\)}} +; CHECK-NOT: movq -{{[0-9]+\(%rsp\)}}, %r{{[a-z]+}} + +define dso_local i32 @f(ptr %p) local_unnamed_addr optdebug { +entry: + call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #1 + notail call void (...) @llvm.fake.use(ptr %p) + ret i32 4 +} diff --git a/llvm/test/CodeGen/X86/fake-use-tailcall.ll b/llvm/test/CodeGen/X86/fake-use-tailcall.ll new file mode 100644 index 00000000000000..10bb22e1b564ab --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-tailcall.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -stop-after=finalize-isel - | FileCheck %s --implicit-check-not FAKE_USE +; Fake uses following tail calls should be pulled in front +; of the TCRETURN instruction. Fake uses using something defined by +; the tail call or after it should be suppressed. + +; CHECK: name:{{ +}}bar +; CHECK: body: +; CHECK: bb.0.{{.*}}: +; CHECK: %0:{{.*}}= COPY +; CHECK: FAKE_USE %0 +; CHECK: TCRETURN + +; CHECK: name:{{ +}}baz +; CHECK: body: +; CHECK: bb.0.{{.*}}: +; CHECK: %0:{{.*}}= COPY +; CHECK: FAKE_USE %0 +; CHECK: TCRETURN + +define void @bar(i32 %v) optdebug { +entry: + %call = tail call i32 @_Z3fooi(i32 %v) + %mul = mul nsw i32 %call, 3 + notail call void (...) @llvm.fake.use(i32 %mul) + notail call void (...) @llvm.fake.use(i32 %call) + notail call void (...) 
@llvm.fake.use(i32 %v) + ret void +} + +define i32 @baz(i32 %v) optdebug { +entry: + %call = tail call i32 @_Z3fooi(i32 %v) + notail call void (...) @llvm.fake.use(i32 %v) + ret i32 %call +} + +declare i32 @_Z3fooi(i32) local_unnamed_addr diff --git a/llvm/test/CodeGen/X86/fake-use-vector.ll b/llvm/test/CodeGen/X86/fake-use-vector.ll new file mode 100644 index 00000000000000..cb46ccc8cac11c --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-vector.ll @@ -0,0 +1,39 @@ +; assert in DAGlegalizer with fake use of 1-element vectors. +; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; +; ModuleID = 't2.cpp' +; source_filename = "t2.cpp" +; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +; +; Check that we get past ISel and generate FAKE_USE machine instructions for +; one-element vectors. +; +; CHECK: bb.0.entry: +; CHECK-DAG: %1:gr64 = COPY $rdi +; CHECK-DAG: %0:vr128 = COPY $xmm0 +; CHECK: %2:vr64 = +; CHECK-DAG: FAKE_USE %1 +; CHECK-DAG: FAKE_USE %0 +; CHECK: RET + + +target triple = "x86_64-unknown-unknown" + +; Function Attrs: nounwind sspstrong uwtable +define <4 x float> @_Z3runDv4_fDv1_x(<4 x float> %r, i64 %b.coerce) local_unnamed_addr #0 { +entry: + %0 = insertelement <1 x i64> undef, i64 %b.coerce, i32 0 + %1 = bitcast i64 %b.coerce to <1 x i64> + %2 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %r, <1 x i64> %1) + tail call void (...) @llvm.fake.use(<1 x i64> %0) + tail call void (...) @llvm.fake.use(<4 x float> %r) + ret <4 x float> %2 +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) + +; Function Attrs: nounwind +declare void @llvm.fake.use(...) 
+ +attributes #0 = { "target-cpu"="btver2" optdebug } diff --git a/llvm/test/CodeGen/X86/fake-use-vector2.ll b/llvm/test/CodeGen/X86/fake-use-vector2.ll new file mode 100644 index 00000000000000..6f2d3a5566dc67 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-vector2.ll @@ -0,0 +1,27 @@ +; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; +; Make sure we can split vectors that are used as operands of FAKE_USE. + +; Generated from: +; +; typedef long __attribute__((ext_vector_type(8))) long8; +; void test0() { long8 id208 {0, 1, 2, 3, 4, 5, 6, 7}; } + +; ModuleID = 't5.cpp' +source_filename = "t5.cpp" + + +; CHECK: %0:vr256 = VMOV +; CHECK: %1:vr256 = VMOV +; CHECK-DAG: FAKE_USE killed %1 +; CHECK-DAG: FAKE_USE killed %0 +; CHECK: RET +define void @_Z5test0v() local_unnamed_addr #0 { +entry: + tail call void (...) @llvm.fake.use(<8 x i64> ) #1 + ret void +} + +declare void @llvm.fake.use(...) + +attributes #0 = { "target-cpu"="btver2" optdebug } diff --git a/llvm/test/CodeGen/X86/fake-use-zero-length.ll b/llvm/test/CodeGen/X86/fake-use-zero-length.ll new file mode 100644 index 00000000000000..e8c6791b8edff2 --- /dev/null +++ b/llvm/test/CodeGen/X86/fake-use-zero-length.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -stop-after=finalize-isel | FileCheck %s --implicit-check-not=FAKE_USE +; +; Make sure SelectionDAG does not crash handling fake uses of zero-length arrays +; and structs. Check also that they are not propagated. 
+; +; Generated from the following source with +; clang -fextend-lifetimes -S -emit-llvm -O2 -mllvm -stop-after=safe-stack -o test.mir test.cpp +; +; int main () +; { int array[0]; } +; +; +; CHECK: liveins: $[[IN_REG:[a-zA-Z0-9]+]] +; CHECK: %[[IN_VREG:[a-zA-Z0-9]+]]:gr32 = COPY $[[IN_REG]] +; CHECK: FAKE_USE %[[IN_VREG]] + +source_filename = "test.ll" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define hidden i32 @main([0 x i32] %zero, [1 x i32] %one) local_unnamed_addr optdebug { +entry: + notail call void (...) @bar([0 x i32] %zero) + notail call void (...) @baz([1 x i32] %one) + notail call void (...) @llvm.fake.use([0 x i32] %zero) + notail call void (...) @llvm.fake.use([1 x i32] %one) + ret i32 0 +} + +declare void @bar([0 x i32] %a) +declare void @baz([1 x i32] %a) diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 12c16a03b134c8..545640b7661691 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -211,6 +211,7 @@ ; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets +; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata diff --git a/llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll b/llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll new file mode 100644 index 00000000000000..65a64583096959 --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/fake-use-global-isel.ll @@ -0,0 +1,98 @@ +; REQUIRES: object-emission + +; Make sure the fake use of 'b' at the end of 'foo' causes location information for 'b' +; to extend all the way to the end of the function. +; Duplicates `DebugInfo/X86/fake-use.ll` for global-isel. 
+ +; RUN: %llc_dwarf -O2 --global-isel=1 -mtriple=aarch64--linux-gnu -filetype=obj -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: %python %p/../Inputs/check-fake-use.py %t +; RUN: sed -e 's,call void (...) @llvm.fake.use,;,' %s \ +; RUN: | %llc_dwarf - -O2 --global-isel=1 -mtriple=aarch64--linux-gnu -filetype=obj -dwarf-linkage-names=Abstract \ +; RUN: | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: not %python %p/../Inputs/check-fake-use.py %t + +; Generated with: +; clang -O2 -g -S -emit-llvm -fextend-this-ptr fake-use.c +; +; int glob[10]; +; extern void bar(); +; +; int foo(int b, int i) +; { +; int loc = glob[i] * 2; +; if (b) { +; glob[2] = loc; +; bar(); +; } +; return loc; +; } +; +; ModuleID = 't2.c' +source_filename = "t2.c" + +@glob = common local_unnamed_addr global [10 x i32] zeroinitializer, align 16, !dbg !0 + +; Function Attrs: nounwind sspstrong uwtable +define i32 @foo(i32 %b, i32 %i) local_unnamed_addr optdebug !dbg !13 { +entry: + #dbg_value(i32 %b, !17, !20, !21) + %c = add i32 %b, 42 + %tobool = icmp sgt i32 %c, 2, !dbg !27 + tail call void (...) @bar() #2, !dbg !32 + %idxprom = sext i32 %i to i64, !dbg !22 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @glob, i64 0, i64 %idxprom, !dbg !22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !23 + %mul = shl nsw i32 %0, 1, !dbg !22 + br i1 %tobool, label %if.end, label %if.then, !dbg !29 + +if.then: ; preds = %entry + store i32 %mul, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @glob, i64 0, i64 2), align 8, !dbg !30, !tbaa !23 + tail call void (...) @bar() #2, !dbg !32 + br label %if.end, !dbg !33 + +if.end: ; preds = %entry, %if.then + call void (...) @llvm.fake.use(i32 %b), !dbg !34 + ret i32 %mul, !dbg !35 +} + +declare void @bar(...) 
local_unnamed_addr + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!9, !10, !11} +!llvm.ident = !{!12} + +!0 = distinct !DIGlobalVariableExpression(var: !DIGlobalVariable(name: "glob", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true), expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 4.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, globals: !4) +!2 = !DIFile(filename: "t2.c", directory: "/") +!3 = !{} +!4 = !{!0} +!5 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 320, align: 32, elements: !7) +!6 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!7 = !{!8} +!8 = !DISubrange(count: 10) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{i32 1, !"PIC Level", i32 2} +!12 = !{!"clang version 4.0.0"} +!13 = distinct !DISubprogram(name: "foo", scope: !2, file: !2, line: 4, type: !14, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !1, retainedNodes: !16) +!14 = !DISubroutineType(types: !15) +!15 = !{!6, !6, !6} +!16 = !{!17, !19} +!17 = !DILocalVariable(name: "b", arg: 1, scope: !13, file: !2, line: 4, type: !6) +!19 = !DILocalVariable(name: "loc", scope: !13, file: !2, line: 6, type: !6) +!20 = !DIExpression() +!21 = !DILocation(line: 4, scope: !13) +!22 = !DILocation(line: 6, scope: !13) +!23 = !{!24, !24, i64 0} +!24 = !{!"int", !25, i64 0} +!25 = !{!"omnipotent char", !26, i64 0} +!26 = !{!"Simple C/C++ TBAA"} +!27 = !DILocation(line: 7, scope: !28) +!28 = distinct !DILexicalBlock(scope: !13, file: !2, line: 7) +!29 = !DILocation(line: 7, scope: !13) +!30 = !DILocation(line: 8, scope: !31) +!31 = distinct !DILexicalBlock(scope: !28, file: !2, line: 7) +!32 = !DILocation(line: 9, scope: !31) +!33 = !DILocation(line: 10, scope: !31) +!34 = !DILocation(line: 12, scope: !13) +!35 = !DILocation(line: 11, scope: !13) 
diff --git a/llvm/test/DebugInfo/Inputs/check-fake-use.py b/llvm/test/DebugInfo/Inputs/check-fake-use.py new file mode 100644 index 00000000000000..7797e102419b24 --- /dev/null +++ b/llvm/test/DebugInfo/Inputs/check-fake-use.py @@ -0,0 +1,107 @@ +#!/usr/bin/python3 + +# Parsing dwarfdump's output to determine whether the location list for the +# parameter "b" covers all of the function. The script searches for information +# in the input file to determine the [prologue, epilogue) range for the +# function, the location list range for "b", and checks that the latter covers +# the entirety of the former. +import re +import sys + +DebugInfoPattern = r"\.debug_info contents:" +DebugLinePattern = r"\.debug_line contents:" +ProloguePattern = r"^\s*0x([0-9a-f]+)\s.+prologue_end" +EpiloguePattern = r"^\s*0x([0-9a-f]+)\s.+epilogue_begin" +FormalPattern = r"^0x[0-9a-f]+:\s+DW_TAG_formal_parameter" +LocationPattern = r"DW_AT_location\s+\[DW_FORM_([a-z_]+)\](?:.*0x([a-f0-9]+))" +DebugLocPattern = r'\[0x([a-f0-9]+),\s+0x([a-f0-9]+)\) ".text": (.+)$' + +SeenDebugInfo = False +SeenDebugLine = False +LocationRanges = None +PrologueEnd = None +EpilogueBegin = None + +# The dwarfdump output should contain the DW_AT_location for "b" first, then the +# line table which should contain prologue_end and epilogue_begin entries. +with open(sys.argv[1], "r") as dwarf_dump_file: + dwarf_iter = iter(dwarf_dump_file) + for line in dwarf_iter: + if not SeenDebugInfo and re.match(DebugInfoPattern, line): + SeenDebugInfo = True + if not SeenDebugLine and re.match(DebugLinePattern, line): + SeenDebugLine = True + # Get the range of DW_AT_location for "b". + if LocationRanges is None: + if match := re.match(FormalPattern, line): + # Go until we either find DW_AT_location or reach the end of this entry. 
+ location_match = None + while location_match is None: + if (line := next(dwarf_iter, "")) == "\n": + raise RuntimeError( + ".debug_info output is missing DW_AT_location for 'b'" + ) + location_match = re.search(LocationPattern, line) + # Variable has whole-scope location, represented by an empty tuple. + if location_match.group(1) == "exprloc": + LocationRanges = () + continue + if location_match.group(1) != "sec_offset": + raise RuntimeError( + f"Unhandled form for DW_AT_location: DW_FORM_{location_match.group(1)}" + ) + # Variable has location range list. + if ( + debug_loc_match := re.search(DebugLocPattern, next(dwarf_iter, "")) + ) is None: + raise RuntimeError(f"Invalid location range list for 'b'") + LocationRanges = ( + int(debug_loc_match.group(1), 16), + int(debug_loc_match.group(2), 16), + ) + while ( + debug_loc_match := re.search(DebugLocPattern, next(dwarf_iter, "")) + ) is not None: + match_loc_start = int(debug_loc_match.group(1), 16) + match_loc_end = int(debug_loc_match.group(2), 16) + match_expr = debug_loc_match.group(3) + if match_loc_start != LocationRanges[1]: + raise RuntimeError( + f"Location list for 'b' is discontinuous from [0x{LocationRanges[1]:x}, 0x{match_loc_start:x})" + ) + if "stack_value" in match_expr: + raise RuntimeError( + f"Location list for 'b' contains a stack_value expression: {match_expr}" + ) + LocationRanges = (LocationRanges[0], match_loc_end) + # Get the prologue_end address. + elif PrologueEnd is None: + if match := re.match(ProloguePattern, line): + PrologueEnd = int(match.group(1), 16) + # Get the epilogue_begin address. 
+ elif EpilogueBegin is None: + if match := re.match(EpiloguePattern, line): + EpilogueBegin = int(match.group(1), 16) + break + +if not SeenDebugInfo: + raise RuntimeError(".debug_info section not found.") +if not SeenDebugLine: + raise RuntimeError(".debug_line section not found.") + +if LocationRanges is None: + raise RuntimeError(".debug_info output is missing parameter 'b'") +if PrologueEnd is None: + raise RuntimeError(".debug_line output is missing prologue_end") +if EpilogueBegin is None: + raise RuntimeError(".debug_line output is missing epilogue_begin") + +if len(LocationRanges) == 2 and ( + LocationRanges[0] > PrologueEnd or LocationRanges[1] < EpilogueBegin +): + raise RuntimeError( + f"""Location list for 'b' does not cover the whole function:") + Prologue to Epilogue = [0x{PrologueEnd:x}, 0x{EpilogueBegin:x}) + Location range = [0x{LocationRanges[0]:x}, 0x{LocationRanges[1]:x}) +""" + ) diff --git a/llvm/test/DebugInfo/X86/fake-use.ll b/llvm/test/DebugInfo/X86/fake-use.ll new file mode 100644 index 00000000000000..f44aadfeef5640 --- /dev/null +++ b/llvm/test/DebugInfo/X86/fake-use.ll @@ -0,0 +1,96 @@ +; REQUIRES: object-emission + +; Make sure the fake use of 'b' at the end of 'foo' causes location information for 'b' +; to extend all the way to the end of the function. + +; RUN: %llc_dwarf -O2 -filetype=obj -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: %python %p/../Inputs/check-fake-use.py %t +; RUN: sed -e 's,call void (...) 
@llvm.fake.use,;,' %s | %llc_dwarf - -O2 -filetype=obj -dwarf-linkage-names=Abstract | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: not %python %p/../Inputs/check-fake-use.py %t + +; Generated with: +; clang -O2 -g -S -emit-llvm -fextend-this-ptr fake-use.c +; +; int glob[10]; +; extern void bar(); +; +; int foo(int b, int i) +; { +; int loc = glob[i] * 2; +; if (b) { +; glob[2] = loc; +; bar(); +; } +; return loc; +; } +; +; ModuleID = 't2.c' +source_filename = "t2.c" + +@glob = common local_unnamed_addr global [10 x i32] zeroinitializer, align 16, !dbg !0 + +; Function Attrs: nounwind sspstrong uwtable +define i32 @foo(i32 %b, i32 %i) local_unnamed_addr optdebug !dbg !13 { +entry: + #dbg_value(i32 %b, !17, !20, !21) + %c = add i32 %b, 42 + %tobool = icmp sgt i32 %c, 2, !dbg !27 + tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() + tail call void (...) @bar() #2, !dbg !32 + %idxprom = sext i32 %i to i64, !dbg !22 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @glob, i64 0, i64 %idxprom, !dbg !22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !23 + %mul = shl nsw i32 %0, 1, !dbg !22 + br i1 %tobool, label %if.end, label %if.then, !dbg !29 + +if.then: ; preds = %entry + store i32 %mul, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @glob, i64 0, i64 2), align 8, !dbg !30, !tbaa !23 + tail call void (...) @bar() #2, !dbg !32 + br label %if.end, !dbg !33 + +if.end: ; preds = %entry, %if.then + call void (...) @llvm.fake.use(i32 %b), !dbg !34 + ret i32 %mul, !dbg !35 +} + +declare void @bar(...) 
local_unnamed_addr + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!9, !10, !11} +!llvm.ident = !{!12} + +!0 = distinct !DIGlobalVariableExpression(var: !DIGlobalVariable(name: "glob", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true), expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 4.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, globals: !4) +!2 = !DIFile(filename: "t2.c", directory: "/") +!3 = !{} +!4 = !{!0} +!5 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 320, align: 32, elements: !7) +!6 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!7 = !{!8} +!8 = !DISubrange(count: 10) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{i32 1, !"PIC Level", i32 2} +!12 = !{!"clang version 4.0.0"} +!13 = distinct !DISubprogram(name: "foo", scope: !2, file: !2, line: 4, type: !14, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !1, retainedNodes: !16) +!14 = !DISubroutineType(types: !15) +!15 = !{!6, !6, !6} +!16 = !{!17, !19} +!17 = !DILocalVariable(name: "b", arg: 1, scope: !13, file: !2, line: 4, type: !6) +!19 = !DILocalVariable(name: "loc", scope: !13, file: !2, line: 6, type: !6) +!20 = !DIExpression() +!21 = !DILocation(line: 4, scope: !13) +!22 = !DILocation(line: 6, scope: !13) +!23 = !{!24, !24, i64 0} +!24 = !{!"int", !25, i64 0} +!25 = !{!"omnipotent char", !26, i64 0} +!26 = !{!"Simple C/C++ TBAA"} +!27 = !DILocation(line: 7, scope: !28) +!28 = distinct !DILexicalBlock(scope: !13, file: !2, line: 7) +!29 = !DILocation(line: 7, scope: !13) +!30 = !DILocation(line: 8, scope: !31) +!31 = distinct !DILexicalBlock(scope: !28, file: !2, line: 7) +!32 = !DILocation(line: 9, scope: !31) +!33 = !DILocation(line: 10, scope: !31) +!34 = !DILocation(line: 12, scope: !13) +!35 = !DILocation(line: 11, scope: !13) 
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index b52849b6bc931d..00601b7ae6e0d2 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -136,14 +136,14 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { // CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2([[#LOWER:]]), GIMT_Encode2([[#UPPER:]]), /*)*//*default:*//*Label 6*/ GIMT_Encode4([[#DEFAULT:]]), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(470), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(506), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(553), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(587), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(610), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(622), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(474), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(510), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(557), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(591), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(614), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(626), // CHECK-NEXT: // Label 0: @[[#%u, mul(UPPER-LOWER, 4) + 10]] -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(494), // Rule ID 4 // +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(498), // Rule ID 4 // // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything), // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a @@ -156,8 +156,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1, // CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner2), -// CHECK-NEXT: // Label 7: @494 -// CHECK-NEXT: GIM_Try, /*On fail 
goto*//*Label 8*/ GIMT_Encode4(505), // Rule ID 3 // +// CHECK-NEXT: // Label 7: @498 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(509), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -165,10 +165,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // Combiner Rule #2: InstTest0 // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1), -// CHECK-NEXT: // Label 8: @505 +// CHECK-NEXT: // Label 8: @509 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @506 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(552), // Rule ID 6 // +// CHECK-NEXT: // Label 1: @510 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(556), // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled), // CHECK-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst @@ -185,10 +185,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 9: @552 +// CHECK-NEXT: // Label 9: @556 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @553 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(586), // Rule ID 5 // +// CHECK-NEXT: // Label 2: @557 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(590), // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), // CHECK-NEXT: // MIs[0] tmp // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] @@ -204,29 +204,29 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // ptr // 
CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1, // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 10: @586 +// CHECK-NEXT: // Label 10: @590 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @587 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(598), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @591 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(602), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 11: @598 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(609), // Rule ID 1 // +// CHECK-NEXT: // Label 11: @602 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(613), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 12: @609 +// CHECK-NEXT: // Label 12: @613 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @610 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(621), // Rule ID 2 // +// CHECK-NEXT: // Label 4: @614 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(625), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 13: @621 +// CHECK-NEXT: // Label 13: @625 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 5: @622 -// CHECK-NEXT: GIM_Try, /*On fail 
goto*//*Label 14*/ GIMT_Encode4(656), // Rule ID 7 // +// CHECK-NEXT: // Label 5: @626 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(660), // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -240,7 +240,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // dst // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 14: @656 +// CHECK-NEXT: // Label 14: @660 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: // Label 6: @[[#%u, DEFAULT]] // CHECK-NEXT: GIM_Reject, diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll new file mode 100644 index 00000000000000..064d3f29dd9ebb --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-phi.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -passes='require,function(codegenprepare)' -S -mtriple=x86_64 | FileCheck %s --implicit-check-not="llvm.fake.use" +; +; When performing return duplication to enable +; tail call optimization we clone fake uses that exist in the to-be-eliminated +; return block into the predecessor blocks. When doing this with fake uses +; of PHI-nodes, they cannot be easily copied, but require the correct operand. +; We are currently not able to do this correctly, so we suppress the cloning +; of such fake uses at the moment. +; +; There should be no fake use of a call result in any of the resulting return +; blocks. + +; Fake uses of `this` should be duplicated into both return blocks. 
+; CHECK: if.then: +; CHECK: @llvm.fake.use({{.*}}this +; CHECK: if.else: +; CHECK: @llvm.fake.use({{.*}}this + +; CHECK: declare void @llvm.fake.use + +source_filename = "test.ll" + +%class.a = type { i8 } + +declare i32 @foo(ptr nonnull dereferenceable(1)) local_unnamed_addr +declare i32 @bar(ptr nonnull dereferenceable(1)) local_unnamed_addr + +define hidden void @func(ptr nonnull dereferenceable(1) %this) local_unnamed_addr align 2 optdebug { +entry: + %b = getelementptr inbounds %class.a, ptr %this, i64 0, i32 0 + %0 = load i8, i8* %b, align 1 + %tobool.not = icmp eq i8 %0, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %call = tail call i32 @foo(ptr nonnull dereferenceable(1) %this) + %call2 = tail call i32 @bar(ptr nonnull dereferenceable(1) %this) + br label %if.end + +if.else: ; preds = %entry + %call4 = tail call i32 @bar(ptr nonnull dereferenceable(1) %this) + %call5 = tail call i32 @foo(ptr nonnull dereferenceable(1) %this) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %call4.sink = phi i32 [ %call4, %if.else ], [ %call, %if.then ] + notail call void (...) @llvm.fake.use(i32 %call4.sink) + notail call void (...) @llvm.fake.use(ptr nonnull %this) + ret void +} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll new file mode 100644 index 00000000000000..b2cf89f6f2dd82 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/fake-use-split-ret.ll @@ -0,0 +1,37 @@ +; RUN: opt -mtriple=x86_64-unknown-unknown -S -codegenprepare <%s -o - | FileCheck %s +; +; Ensure return instruction splitting ignores fake uses. 
+; +; IR Generated with clang -O2 -S -emit-llvm -fextend-lifetimes test.cpp +; +;// test.cpp +;extern int bar(int); +; +;int foo2(int i) +;{ +; --i; +; if (i <= 0) +; return -1; +; return bar(i); +;} + +declare i32 @_Z3bari(i32) local_unnamed_addr + +define i32 @_Z4foo2i(i32 %i) local_unnamed_addr optdebug { +entry: + %dec = add nsw i32 %i, -1 + %cmp = icmp slt i32 %i, 2 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %call = tail call i32 @_Z3bari(i32 %dec) +; CHECK: ret i32 %call + br label %cleanup + +cleanup: ; preds = %entry, %if.end +; CHECK: cleanup: + %retval.0 = phi i32 [ %call, %if.end ], [ -1, %entry ] + tail call void (...) @llvm.fake.use(i32 %dec) +; CHECK: ret i32 -1 + ret i32 %retval.0 +} diff --git a/llvm/test/Transforms/GVN/fake-use-constprop.ll b/llvm/test/Transforms/GVN/fake-use-constprop.ll new file mode 100644 index 00000000000000..1466f9f9fca277 --- /dev/null +++ b/llvm/test/Transforms/GVN/fake-use-constprop.ll @@ -0,0 +1,60 @@ +; RUN: opt -passes=gvn -S < %s | FileCheck %s +; +; The Global Value Numbering pass (GVN) propagates boolean values +; that are constant in dominated basic blocks to all the uses +; in these basic blocks. However, we don't want the constant propagated +; into fake.use intrinsics since this would render the intrinsic useless +; with respect to keeping the variable live up until the fake.use. +; This test checks that we don't generate any fake.uses with constant 0. 
+; +; Reduced from the following test case, generated with clang -O2 -S -emit-llvm -fextend-lifetimes test.c +; +; extern void func1(); +; extern int bar(); +; extern void baz(int); +; +; int foo(int i, float f, int *punused) +; { +; int j = 3*i; +; if (j > 0) { +; int m = bar(i); +; if (m) { +; char b = f; +; baz(b); +; if (b) +; goto lab; +; func1(); +; } +; lab: +; func1(); +; } +; return 1; +; } + +;; GVN should propagate a constant value through to a regular call, but not to +;; a fake use, which should continue to track the original value. +; CHECK: %[[CONV_VAR:[a-zA-Z0-9]+]] = fptosi +; CHECK: call {{.+}} @bees(i8 0) +; CHECK: call {{.+}} @llvm.fake.use(i8 %[[CONV_VAR]]) + +define i32 @foo(float %f) optdebug { + %conv = fptosi float %f to i8 + %tobool3 = icmp eq i8 %conv, 0 + br i1 %tobool3, label %if.end, label %lab + +if.end: + tail call void (...) @bees(i8 %conv) + tail call void (...) @llvm.fake.use(i8 %conv) + br label %lab + +lab: + ret i32 1 +} + +declare i32 @bar(...) + +declare void @baz(i32) + +declare void @bees(i32) + +declare void @func1(...) diff --git a/llvm/test/Transforms/SROA/fake-use-escape.ll b/llvm/test/Transforms/SROA/fake-use-escape.ll new file mode 100644 index 00000000000000..5429d09740e522 --- /dev/null +++ b/llvm/test/Transforms/SROA/fake-use-escape.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=sroa %s | FileCheck %s +; +;; Check that we do not assert and that we retain the fake_use instruction that +;; uses the address of bar. +; +; CHECK: define{{.*}}foo +; CHECK: call{{.*llvm\.fake\.use.*}}(ptr %bar.addr) + +define void @_Z3fooPi(ptr %bar) { +entry: + %bar.addr = alloca ptr, align 8 + %baz = alloca ptr, align 8 + store ptr %bar, ptr %bar.addr, align 8 + store ptr %bar.addr, ptr %baz, align 8 + %0 = load ptr, ptr %bar.addr, align 8 + %1 = load ptr, ptr %baz, align 8 + call void (...) @llvm.fake.use(ptr %1) + ret void +} + +declare void @llvm.fake.use(...) 
diff --git a/llvm/test/Transforms/SROA/fake-use-sroa.ll b/llvm/test/Transforms/SROA/fake-use-sroa.ll new file mode 100644 index 00000000000000..9e92df15487506 --- /dev/null +++ b/llvm/test/Transforms/SROA/fake-use-sroa.ll @@ -0,0 +1,52 @@ +; RUN: opt -S -passes=sroa %s | FileCheck %s +; With fake use instrinsics generated for small aggregates, check that when +; SROA slices the aggregate, we generate individual fake use intrinsics for +; the individual values. + +; Generated from the following source: +; struct s { +; int i; +; int j; +; }; +; +; void foo(struct s S) { +; } +; +; void bar() { +; int arr[2] = {5, 6}; +; } +; +%struct.s = type { i32, i32 } +@__const.bar.arr = private unnamed_addr constant [2 x i32] [i32 5, i32 6], align 4 + +; A small struct passed as parameter +; CHECK-LABEL: define{{.*}}foo +; CHECK: %[[SLICE1:[^ ]+]] = trunc i64 +; CHECK: %[[SLICE2:[^ ]+]] = trunc i64 +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[SLICE1]]) +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[SLICE2]]) +define dso_local void @foo(i64 %S.coerce) optdebug { +entry: + %S = alloca %struct.s, align 4 + store i64 %S.coerce, ptr %S, align 4 + %fake.use = load %struct.s, ptr %S, align 4 + notail call void (...) @llvm.fake.use(%struct.s %fake.use) + ret void +} + +; A local variable with a small array type. +; CHECK-LABEL: define{{.*}}bar +; CHECK: %[[ARRAYSLICE1:[^ ]+]] = load +; CHECK: %[[ARRAYSLICE2:[^ ]+]] = load +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[ARRAYSLICE1]]) +; CHECK-DAG: call{{.*}} @llvm.fake.use(i32 %[[ARRAYSLICE2]]) +define dso_local void @bar() optdebug { +entry: + %arr = alloca [2 x i32], align 4 + call void @llvm.memcpy.p0i8.p0i8.i64(ptr align 4 %arr, ptr align 4 bitcast (ptr @__const.bar.arr to ptr), i64 8, i1 false) + %fake.use = load [2 x i32], ptr %arr, align 4 + notail call void (...) 
@llvm.fake.use([2 x i32] %fake.use) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index bbd1f9af65095a..3a4c7ea03b3aa9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -201,6 +201,7 @@ static_library("CodeGen") { "RegisterPressure.cpp", "RegisterScavenging.cpp", "RegisterUsageInfo.cpp", + "RemoveLoadsIntoFakeUses.cpp", "RemoveRedundantDebugValues.cpp", "RenameIndependentSubregs.cpp", "ReplaceWithVeclib.cpp", From 74b4ec17e24a256b4aae5e53b855ba429af685bf Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 09:57:58 -0700 Subject: [PATCH 19/72] [VP] Remove VP_PROPERTY_REDUCTION and VP_PROPERTY_CMP [nfc] (#105551) These lists are quite static and several of the parameters are actually constant across all users. Heavy use of macros is undesirable, and not idiomatic in LLVM, so let's just use the naive switch cases. I'll probably continue with removing the other property macros. These two just happened to be the two I actually had to figure out for an unrelated change. --- llvm/include/llvm/IR/VPIntrinsics.def | 19 ----- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 25 +++++-- llvm/lib/IR/IntrinsicInst.cpp | 73 ++++++++----------- 3 files changed, 48 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 521cbc2dc278f9..3fad00e2caf21f 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -129,11 +129,6 @@ #define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) #endif -// Map this VP reduction intrinsic to its reduction operand positions. 
-#ifndef VP_PROPERTY_REDUCTION -#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) -#endif - // A property to infer VP binary-op SDNode opcodes automatically. #ifndef VP_PROPERTY_BINARYOP #define VP_PROPERTY_BINARYOP @@ -144,13 +139,6 @@ #define VP_PROPERTY_CASTOP #endif -// This VP Intrinsic is a comparison operation -// The condition code arg is at CCPOS and accepts floating-point condition -// codes if ISFP is set, else it accepts integer condition codes. -#ifndef VP_PROPERTY_CMP -#define VP_PROPERTY_CMP(CCPOS, ISFP) -#endif - /// } Property Macros ///// Integer Arithmetic { @@ -567,7 +555,6 @@ END_REGISTER_VP_SDNODE(VP_SETCC) BEGIN_REGISTER_VP_INTRINSIC(vp_fcmp, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_fcmp, VP_SETCC) VP_PROPERTY_FUNCTIONAL_OPC(FCmp) -VP_PROPERTY_CMP(2, true) VP_PROPERTY_CONSTRAINEDFP(0, 1, experimental_constrained_fcmp) END_REGISTER_VP_INTRINSIC(vp_fcmp) @@ -575,7 +562,6 @@ END_REGISTER_VP_INTRINSIC(vp_fcmp) BEGIN_REGISTER_VP_INTRINSIC(vp_icmp, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_icmp, VP_SETCC) VP_PROPERTY_FUNCTIONAL_OPC(ICmp) -VP_PROPERTY_CMP(2, false) END_REGISTER_VP_INTRINSIC(vp_icmp) ///// } Comparisons @@ -655,7 +641,6 @@ END_REGISTER_VP(vp_gather, VP_GATHER) BEGIN_REGISTER_VP(VPID, 2, 3, VPSD, 1) \ VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ - VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP(VPID, VPSD) // llvm.vp.reduce.add(start,x,mask,vlen) @@ -725,13 +710,11 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM, #define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, SDOPC, SEQ_SDOPC, INTRIN) \ BEGIN_REGISTER_VP_INTRINSIC(VPID, 2, 3) \ BEGIN_REGISTER_VP_SDNODE(VPSD, 1, VPID, 2, 3) \ - VP_PROPERTY_REDUCTION(0, 1) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ END_REGISTER_VP_SDNODE(VPSD) \ BEGIN_REGISTER_VP_SDNODE(SEQ_VPSD, 1, VPID, 2, 3) \ HELPER_MAP_VPID_TO_VPSD(VPID, SEQ_VPSD) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SEQ_SDOPC) \ - VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP_SDNODE(SEQ_VPSD) \ 
VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ END_REGISTER_VP_INTRINSIC(VPID) @@ -796,11 +779,9 @@ END_REGISTER_VP(experimental_vp_splat, EXPERIMENTAL_VP_SPLAT) #undef HELPER_MAP_VPID_TO_VPSD #undef VP_PROPERTY_BINARYOP #undef VP_PROPERTY_CASTOP -#undef VP_PROPERTY_CMP #undef VP_PROPERTY_CONSTRAINEDFP #undef VP_PROPERTY_FUNCTIONAL_INTRINSIC #undef VP_PROPERTY_FUNCTIONAL_OPC #undef VP_PROPERTY_FUNCTIONAL_SDOPC #undef VP_PROPERTY_NO_FUNCTIONAL #undef VP_PROPERTY_MEMOP -#undef VP_PROPERTY_REDUCTION diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9c7a43064ecf62..9efcd3f25797b5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -503,13 +503,26 @@ bool ISD::isVPBinaryOp(unsigned Opcode) { bool ISD::isVPReduction(unsigned Opcode) { switch (Opcode) { default: - break; -#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD: -#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true; -#define END_REGISTER_VP_SDNODE(VPSD) break; -#include "llvm/IR/VPIntrinsics.def" + return false; + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + case ISD::VP_REDUCE_FMAX: + case ISD::VP_REDUCE_FMIN: + case ISD::VP_REDUCE_FMAXIMUM: + case ISD::VP_REDUCE_FMINIMUM: + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_FMUL: + case ISD::VP_REDUCE_SEQ_FADD: + case ISD::VP_REDUCE_SEQ_FMUL: + return true; } - return false; } /// The operand position of the vector mask. 
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 7680fd02562316..966fa62abd94fe 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -738,14 +738,25 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { switch (ID) { + case Intrinsic::vp_reduce_add: + case Intrinsic::vp_reduce_mul: + case Intrinsic::vp_reduce_and: + case Intrinsic::vp_reduce_or: + case Intrinsic::vp_reduce_xor: + case Intrinsic::vp_reduce_smax: + case Intrinsic::vp_reduce_smin: + case Intrinsic::vp_reduce_umax: + case Intrinsic::vp_reduce_umin: + case Intrinsic::vp_reduce_fmax: + case Intrinsic::vp_reduce_fmin: + case Intrinsic::vp_reduce_fmaximum: + case Intrinsic::vp_reduce_fminimum: + case Intrinsic::vp_reduce_fadd: + case Intrinsic::vp_reduce_fmul: + return true; default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return false; } - return false; } bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { @@ -763,13 +774,11 @@ bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { bool VPCmpIntrinsic::isVPCmp(Intrinsic::ID ID) { switch (ID) { default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_CMP(CCPOS, ...) 
return true; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return false; + case Intrinsic::vp_fcmp: + case Intrinsic::vp_icmp: + return true; } - return false; } bool VPBinOpIntrinsic::isVPBinOp(Intrinsic::ID ID) { @@ -803,22 +812,10 @@ static ICmpInst::Predicate getIntPredicateFromMD(const Value *Op) { } CmpInst::Predicate VPCmpIntrinsic::getPredicate() const { - bool IsFP = true; - std::optional CCArgIdx; - switch (getIntrinsicID()) { - default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_CMP(CCPOS, ISFP) \ - CCArgIdx = CCPOS; \ - IsFP = ISFP; \ - break; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - } - assert(CCArgIdx && "Unexpected vector-predicated comparison"); - return IsFP ? getFPPredicateFromMD(getArgOperand(*CCArgIdx)) - : getIntPredicateFromMD(getArgOperand(*CCArgIdx)); + assert(isVPCmp(getIntrinsicID())); + return getIntrinsicID() == Intrinsic::vp_fcmp + ? getFPPredicateFromMD(getArgOperand(2)) + : getIntPredicateFromMD(getArgOperand(2)); } unsigned VPReductionIntrinsic::getVectorParamPos() const { @@ -831,27 +828,15 @@ unsigned VPReductionIntrinsic::getStartParamPos() const { std::optional VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { - switch (ID) { -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return VECTORPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - default: - break; - } + if (isVPReduction(ID)) + return 1; return std::nullopt; } std::optional VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { - switch (ID) { -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) 
case Intrinsic::VPID: -#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return STARTPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - default: - break; - } + if (isVPReduction(ID)) + return 0; return std::nullopt; } From eed135fea72b544426349e6461a0ca142c27967e Mon Sep 17 00:00:00 2001 From: Thomas Preud'homme Date: Thu, 29 Aug 2024 17:56:49 +0100 Subject: [PATCH 20/72] Revert "[Analysis] Guard logf128 cst folding" This reverts commit 42d3cccffd203ff6dc967d4243588ca466c0faf7 which caused a test failure. --- llvm/lib/Analysis/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 99ce0acdabe1ba..393803fad89383 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -163,6 +163,8 @@ add_llvm_component_library(LLVMAnalysis TargetParser ) -if(LLVM_HAS_LOGF128) - target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) +if(HAS_LOGF128) + target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) endif() From 178392454e076624674b4a7ddf3fc8bda2e94f0e Mon Sep 17 00:00:00 2001 From: Harini0924 Date: Thu, 29 Aug 2024 10:00:58 -0700 Subject: [PATCH 21/72] [llvm-lit] Print environment variables when using env without subcommand (#98414) This patch addresses an issue with lit's internal shell when env is without any arguments, it fails with exit code 127 because `env` requires a subcommand. This patch addresses the issue by encoding the command to properly return environment variables even when no arguments are provided. The error occurred when running the command ` LIT_USE_INTERNAL_SHELL=1 ninja check-llvm`. 
fixes: #102383 This is part of the test cleanups proposed in the RFC: [[RFC] Enabling the Lit Internal Shell by Default](https://discourse.llvm.org/t/rfc-enabling-the-lit-internal-shell-by-default/80179) --- llvm/utils/lit/lit/TestRunner.py | 11 +- .../env-calls-cd.txt | 0 .../env-calls-colon.txt | 0 .../env-calls-echo.txt | 0 .../env-calls-export.txt | 0 .../env-calls-mkdir.txt | 0 .../env-calls-not-builtin.txt | 0 .../env-calls-rm.txt | 0 .../lit.cfg | 1 + .../env-args-last-is-assign.txt | 0 .../env-args-last-is-u-arg.txt | 0 .../env-args-last-is-u.txt | 0 .../env-args-nested-none.txt | 0 .../shtest-env-positive/env-calls-env.txt | 32 +++++ .../shtest-env-positive/env-no-subcommand.txt | 37 +++++ .../Inputs/shtest-env-positive/env-u.txt | 22 +++ .../tests/Inputs/shtest-env-positive/env.txt | 15 +++ .../tests/Inputs/shtest-env-positive/lit.cfg | 11 ++ .../Inputs/shtest-env-positive/mixed.txt | 18 +++ .../tests/Inputs/shtest-env/env-args-none.txt | 1 - .../tests/Inputs/shtest-env/env-calls-env.txt | 32 ----- .../lit/tests/Inputs/shtest-env/env-u.txt | 23 ---- .../utils/lit/tests/Inputs/shtest-env/env.txt | 15 --- .../lit/tests/Inputs/shtest-env/mixed.txt | 18 --- .../Inputs/shtest-env/print_environment.py | 9 -- llvm/utils/lit/tests/shtest-env-negative.py | 49 +++++++ llvm/utils/lit/tests/shtest-env-positive.py | 70 ++++++++++ llvm/utils/lit/tests/shtest-env.py | 126 ------------------ 28 files changed, 265 insertions(+), 225 deletions(-) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-cd.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-colon.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-echo.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-export.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-mkdir.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => 
shtest-env-negative}/env-calls-not-builtin.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/env-calls-rm.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-negative}/lit.cfg (90%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-last-is-assign.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-last-is-u-arg.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-last-is-u.txt (100%) rename llvm/utils/lit/tests/Inputs/{shtest-env => shtest-env-positive}/env-args-nested-none.txt (100%) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/env.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py create mode 100644 llvm/utils/lit/tests/shtest-env-negative.py create mode 100644 llvm/utils/lit/tests/shtest-env-positive.py delete mode 100644 llvm/utils/lit/tests/shtest-env.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 223a6c6e4ca0a2..19f35fc7e212f3 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -742,7 +742,16 @@ def _executeShCmd(cmd, shenv, 
results, timeoutHelper): cmd_shenv = ShellEnvironment(shenv.cwd, shenv.env) args = updateEnv(cmd_shenv, args) if not args: - raise InternalShellError(j, "Error: 'env' requires a" " subcommand") + # Return the environment variables if no argument is provided. + env_str = "\n".join( + f"{key}={value}" for key, value in sorted(cmd_shenv.env.items()) + ) + results.append( + ShellCommandResult( + j, env_str, "", 0, timeoutHelper.timeoutReached(), [] + ) + ) + return 0 elif args[0] == "not": not_args.append(args.pop(0)) not_count += 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-cd.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-cd.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-cd.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-cd.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-colon.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-colon.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-colon.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-colon.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-echo.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-echo.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-echo.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-echo.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-export.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-export.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-export.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-export.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-mkdir.txt similarity index 100% rename from 
llvm/utils/lit/tests/Inputs/shtest-env/env-calls-mkdir.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-mkdir.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-not-builtin.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-not-builtin.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-not-builtin.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-not-builtin.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-rm.txt b/llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-rm.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-calls-rm.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/env-calls-rm.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-negative/lit.cfg similarity index 90% rename from llvm/utils/lit/tests/Inputs/shtest-env/lit.cfg rename to llvm/utils/lit/tests/Inputs/shtest-env-negative/lit.cfg index df9df7da81daaf..626c00f71d7287 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-env/lit.cfg +++ b/llvm/utils/lit/tests/Inputs/shtest-env-negative/lit.cfg @@ -7,4 +7,5 @@ config.test_source_root = None config.test_exec_root = None config.environment["FOO"] = "1" config.environment["BAR"] = "2" +config.environment["QUX"] = "3" config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-assign.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-assign.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-assign.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-assign.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u-arg.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u-arg.txt similarity index 100% rename from 
llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u-arg.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u-arg.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-args-last-is-u.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-last-is-u.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-nested-none.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-nested-none.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-env/env-args-nested-none.txt rename to llvm/utils/lit/tests/Inputs/shtest-env-positive/env-args-nested-none.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt new file mode 100644 index 00000000000000..ee40c60a1e4b65 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-calls-env.txt @@ -0,0 +1,32 @@ +## Tests the behaviour of chaining env commands together. + +## Check that internal env can call internal env. +# RUN: env env | FileCheck -check-prefix=CHECK-2-EMPTY-ARGS %s +# +# CHECK-2-EMPTY-ARGS: BAR = 2 +# CHECK-2-EMPTY-ARGS: FOO = 1 + +## Check setting variables in a nested env call. +# RUN: env FOO=2 env BAR=1 | FileCheck -check-prefix=CHECK-2-VAL %s +# +# CHECK-2-VAL: BAR = 1 +# CHECK-2-VAL: FOO = 2 + +## Check unsetting variables in a nested env call. +# RUN: env -u FOO env -u BAR | FileCheck -check-prefix=CHECK-2-U %s +# +# CHECK-2-U-NOT: BAR +# CHECK-2-U-NOT: FOO + +## Check mixed setting and unsetting in nested env calls. +# RUN: env -u FOO BAR=1 env -u BAR FOO=2 | FileCheck -check-prefix=CHECK-2-U-VAL %s +# +# CHECK-2-U-VAL-NOT: BAR +# CHECK-2-U-VAL: FOO = 2 + +## Check setting, unsetting, and adding a new variable in nested env calls. 
+# RUN: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 | FileCheck -check-prefix=CHECK-3 %s +# +# CHECK-3-NOT: BAR +# CHECK-3: BAZ = 3 +# CHECK-3: FOO = 2 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt new file mode 100644 index 00000000000000..761a8061a0b0de --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-no-subcommand.txt @@ -0,0 +1,37 @@ +## Tests the env command in various scenarios: without arguments, setting, unsetting, and mixing envrionment variables. + +## Check default environment. +# RUN: env | FileCheck -check-prefix=NO-ARGS %s +# +# NO-ARGS: BAR=2 +# NO-ARGS: FOO=1 +# NO-ARGS: QUX=3 + +## Set environment variables. +# RUN: env FOO=2 BAR=1 | FileCheck -check-prefix=SET-VAL %s +# +# SET-VAL: BAR=1 +# SET-VAL: FOO=2 +# SET-VAL: QUX=3 + +## Unset environment variables. +# RUN: env -u FOO -u BAR | FileCheck -check-prefix=UNSET-U %s +# +# UNSET-U-NOT: BAR +# UNSET-U-NOT: FOO +# UNSET-U: QUX=3 + +## Mixed set and unset environment variables. +# RUN: env -u FOO BAR=1 -u BAR FOO=2 | FileCheck -check-prefix=MIXED-SET-UNSET %s +# +# MIXED-SET-UNSET-NOT: BAR +# MIXED-SET-UNSET: FOO=2 +# MIXED-SET-UNSET: QUX=3 + +## Mixed set and unset with additional variable. +# RUN: env -u FOO BAR=1 -u BAR FOO=2 BAZ=4 | FileCheck -check-prefix=MIXED-SET-UNSET-ADD-3 %s +# +# MIXED-SET-UNSET-ADD-NOT: BAR +# MIXED-SET-UNSET-ADD: BAZ=4 +# MIXED-SET-UNSET-ADD: FOO=2 +# MIXED-SET-UNSET-ADD: QUX=3 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt new file mode 100644 index 00000000000000..2945639c0642df --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env-u.txt @@ -0,0 +1,22 @@ +## Tests env command for preset variables and handling single/multiple unsets. + +## Check and make sure preset environment variable were set in lit.cfg. 
+# +# RUN: env | FileCheck --check-prefix=CHECK-ENV-PRESET %s +# +## Check single unset of environment variable. +# +# RUN: env -u FOO | FileCheck --check-prefix=CHECK-ENV-UNSET-1 %s +# +## Check multiple unsets of environment variables. +# +# RUN: env -u FOO -u BAR | FileCheck --check-prefix=CHECK-ENV-UNSET-MULTIPLE %s + +# CHECK-ENV-PRESET: BAR = 2 +# CHECK-ENV-PRESET: FOO = 1 + +# CHECK-ENV-UNSET-1: BAR = 2 +# CHECK-ENV-UNSET-1-NOT: FOO + +# CHECK-ENV-UNSET-MULTIPLE-NOT: BAR +# CHECK-ENV-UNSET-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt new file mode 100644 index 00000000000000..74a2a65d260f41 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/env.txt @@ -0,0 +1,15 @@ +## Tests env command for setting single and multiple environment variables. + +## Check for simple one environment variable setting. +# +# RUN: env A_FOO=999 | FileCheck --check-prefix=CHECK-ENV-1 %s +# +## Check for multiple environment variable settings. 
+# +# RUN: env A_FOO=1 B_BAR=2 C_OOF=3 | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s + +# CHECK-ENV-1: A_FOO = 999 + +# CHECK-ENV-MULTIPLE: A_FOO = 1 +# CHECK-ENV-MULTIPLE: B_BAR = 2 +# CHECK-ENV-MULTIPLE: C_OOF = 3 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg new file mode 100644 index 00000000000000..626c00f71d7287 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/lit.cfg @@ -0,0 +1,11 @@ +import lit.formats + +config.name = "shtest-env" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None +config.environment["FOO"] = "1" +config.environment["BAR"] = "2" +config.environment["QUX"] = "3" +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt b/llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt new file mode 100644 index 00000000000000..c2c4e8bfdfc8bd --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-positive/mixed.txt @@ -0,0 +1,18 @@ +## Tests env command for setting and unsetting single and multiple environment variables. 
+ +## Check for setting and removing one environment variable +# +# RUN: env A_FOO=999 -u FOO | FileCheck --check-prefix=CHECK-ENV-1 %s +# +## Check for setting/unsetting multiple environment variables +# +# RUN: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s + +# CHECK-ENV-1: A_FOO = 999 +# CHECK-ENV-1-NOT: FOO + +# CHECK-ENV-MULTIPLE: A_FOO = 1 +# CHECK-ENV-MULTIPLE-NOT: BAR +# CHECK-ENV-MULTIPLE: B_BAR = 2 +# CHECK-ENV-MULTIPLE: C_OOF = 3 +# CHECK-ENV-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt deleted file mode 100644 index dc5cdbad09afc9..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env-args-none.txt +++ /dev/null @@ -1 +0,0 @@ -# RUN: env diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt deleted file mode 100644 index 26150c413dc03d..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Check that internal env can call internal env. 
- -# RUN: env env %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-EMPTY-ARGS %s -# -# CHECK-2-EMPTY-ARGS: BAR = 2 -# CHECK-2-EMPTY-ARGS: FOO = 1 - -# RUN: env FOO=2 env BAR=1 %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-VAL %s -# -# CHECK-2-VAL: BAR = 1 -# CHECK-2-VAL: FOO = 2 - -# RUN: env -u FOO env -u BAR %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-U %s -# -# CHECK-2-U-NOT: BAR -# CHECK-2-U-NOT: FOO - -# RUN: env -u FOO BAR=1 env -u BAR FOO=2 %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-2-U-VAL %s -# -# CHECK-2-U-VAL-NOT: BAR -# CHECK-2-U-VAL: FOO = 2 - -# RUN: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 %{python} print_environment.py \ -# RUN: | FileCheck -check-prefix=CHECK-3 %s -# -# CHECK-3-NOT: BAR -# CHECK-3: BAZ = 3 -# CHECK-3: FOO = 2 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt deleted file mode 100644 index 9cdf9d08850f78..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env-u.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Check and make sure preset environment variable were set in lit.cfg -# -# RUN: %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-PRESET %s -# -# Check single unset of environment variable -# -# RUN: env -u FOO %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-UNSET-1 %s -# -# Check multiple unsets of environment variables -# -# RUN: env -u FOO -u BAR %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-UNSET-MULTIPLE %s - -# CHECK-ENV-PRESET: BAR = 2 -# CHECK-ENV-PRESET: FOO = 1 - -# CHECK-ENV-UNSET-1: BAR = 2 -# CHECK-ENV-UNSET-1-NOT: FOO - -# CHECK-ENV-UNSET-MULTIPLE-NOT: BAR -# CHECK-ENV-UNSET-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env.txt deleted file mode 100644 index 
aa697b0c4081f4..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/env.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Check for simple one environment variable setting -# -# RUN: env A_FOO=999 %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-1 %s -# -# Check for multiple environment variable settings -# -# RUN: env A_FOO=1 B_BAR=2 C_OOF=3 %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s - -# CHECK-ENV-1: A_FOO = 999 - -# CHECK-ENV-MULTIPLE: A_FOO = 1 -# CHECK-ENV-MULTIPLE: B_BAR = 2 -# CHECK-ENV-MULTIPLE: C_OOF = 3 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt b/llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt deleted file mode 100644 index be32d458843bc3..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/mixed.txt +++ /dev/null @@ -1,18 +0,0 @@ -# Check for setting and removing one environment variable -# -# RUN: env A_FOO=999 -u FOO %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-1 %s -# -# Check for setting/unsetting multiple environment variables -# -# RUN: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 %{python} print_environment.py \ -# RUN: | FileCheck --check-prefix=CHECK-ENV-MULTIPLE %s - -# CHECK-ENV-1: A_FOO = 999 -# CHECK-ENV-1-NOT: FOO - -# CHECK-ENV-MULTIPLE: A_FOO = 1 -# CHECK-ENV-MULTIPLE-NOT: BAR -# CHECK-ENV-MULTIPLE: B_BAR = 2 -# CHECK-ENV-MULTIPLE: C_OOF = 3 -# CHECK-ENV-MULTIPLE-NOT: FOO diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py b/llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py deleted file mode 100644 index e39bd73e44a108..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-env/print_environment.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function -import os - -sorted_environment = sorted(os.environ.items()) - -for name, value in sorted_environment: - print(name, "=", value) diff --git a/llvm/utils/lit/tests/shtest-env-negative.py 
b/llvm/utils/lit/tests/shtest-env-negative.py new file mode 100644 index 00000000000000..c8b59b224e7c43 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-env-negative.py @@ -0,0 +1,49 @@ +## Test the env command (failing tests). + +# RUN: not %{lit} -a -v %{inputs}/shtest-env-negative \ +# RUN: | FileCheck -match-full-lines %s +# +# END. + +## Test the env command's expected failures. + +# CHECK: -- Testing: 7 tests{{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-cd.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 cd foobar +# CHECK: # executed command: env -u FOO BAR=3 cd foobar +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-colon.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 : +# CHECK: # executed command: env -u FOO BAR=3 : +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-echo.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 echo hello world +# CHECK: # executed command: env -u FOO BAR=3 echo hello world +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-export.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 export BAZ=3 +# CHECK: # executed command: env -u FOO BAR=3 export BAZ=3 +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-mkdir.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 mkdir foobar +# CHECK: # executed command: env -u FOO BAR=3 mkdir foobar +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-not-builtin.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 not rm {{.+}}.no-such-file +# CHECK: # executed command: env -u FOO BAR=3 not rm {{.+}}.no-such-file{{.*}} +# CHECK: # error: command failed with exit status: {{.*}} + +# CHECK: FAIL: shtest-env :: env-calls-rm.txt ({{[^)]*}}) +# CHECK: env -u FOO BAR=3 rm foobar +# CHECK: # executed command: env -u FOO BAR=3 rm foobar +# CHECK: # error: command failed with exit status: {{.*}} + +# 
CHECK: Total Discovered Tests: 7 +# CHECK: Failed: 7 {{\([0-9]*\.[0-9]*%\)}} +# CHECK-NOT: {{.}} diff --git a/llvm/utils/lit/tests/shtest-env-positive.py b/llvm/utils/lit/tests/shtest-env-positive.py new file mode 100644 index 00000000000000..863fbda8c5b6dc --- /dev/null +++ b/llvm/utils/lit/tests/shtest-env-positive.py @@ -0,0 +1,70 @@ +## Test the env command (passing tests). + +# RUN: %{lit} -a -v %{inputs}/shtest-env-positive \ +# RUN: | FileCheck -match-full-lines %s +# +# END. + +## Test the env command's successful executions. + +# CHECK: -- Testing: 9 tests{{.*}} + +# CHECK: PASS: shtest-env :: env-args-last-is-assign.txt ({{[^)]*}}) +# CHECK: env FOO=1 +# CHECK: # executed command: env FOO=1 +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-args-last-is-u-arg.txt ({{[^)]*}}) +# CHECK: env -u FOO +# CHECK: # executed command: env -u FOO +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-args-last-is-u.txt ({{[^)]*}}) +# CHECK: env -u +# CHECK: # executed command: env -u +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-args-nested-none.txt ({{[^)]*}}) +# CHECK: env env env +# CHECK: # executed command: env env env +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-calls-env.txt ({{[^)]*}}) +# CHECK: env env | {{.*}} +# CHECK: # executed command: env env +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-no-subcommand.txt ({{[^)]*}}) +# CHECK: env | {{.*}} +# CHECK: # executed command: env +# CHECK: env FOO=2 BAR=1 | {{.*}} +# CHECK: # executed command: env FOO=2 BAR=1 +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env-u.txt ({{[^)]*}}) +# CHECK: env -u FOO | {{.*}} +# CHECK: # executed command: env -u FOO +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env :: env.txt ({{[^)]*}}) +# CHECK: env A_FOO=999 | {{.*}} +# CHECK: # executed command: env A_FOO=999 +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: PASS: shtest-env 
:: mixed.txt ({{[^)]*}}) +# CHECK: env A_FOO=999 -u FOO | {{.*}} +# CHECK: # executed command: env A_FOO=999 -u FOO +# CHECK-NOT: # error: +# CHECK: -- + +# CHECK: Total Discovered Tests: 9 +# CHECK: Passed: 9 {{\([0-9]*\.[0-9]*%\)}} +# CHECK-NOT: {{.}} diff --git a/llvm/utils/lit/tests/shtest-env.py b/llvm/utils/lit/tests/shtest-env.py deleted file mode 100644 index 03bb4a3cae7dd1..00000000000000 --- a/llvm/utils/lit/tests/shtest-env.py +++ /dev/null @@ -1,126 +0,0 @@ -# Check the env command - -# RUN: not %{lit} -a -v %{inputs}/shtest-env \ -# RUN: | FileCheck -match-full-lines %s -# -# END. - -# Make sure env commands are included in printed commands. - -# CHECK: -- Testing: 16 tests{{.*}} - -# CHECK: FAIL: shtest-env :: env-args-last-is-assign.txt ({{[^)]*}}) -# CHECK: env FOO=1 -# CHECK: # executed command: env FOO=1 -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-last-is-u-arg.txt ({{[^)]*}}) -# CHECK: env -u FOO -# CHECK: # executed command: env -u FOO -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-last-is-u.txt ({{[^)]*}}) -# CHECK: env -u -# CHECK: # executed command: env -u -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-nested-none.txt ({{[^)]*}}) -# CHECK: env env env -# CHECK: # executed command: env env env -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-args-none.txt ({{[^)]*}}) -# CHECK: env -# CHECK: # executed command: env -# CHECK: # | Error: 'env' requires a subcommand -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-cd.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 cd foobar -# CHECK: # executed command: 
env -u FOO BAR=3 cd foobar -# CHECK: # | Error: 'env' cannot call 'cd' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-colon.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 : -# CHECK: # executed command: env -u FOO BAR=3 : -# CHECK: # | Error: 'env' cannot call ':' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-echo.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 echo hello world -# CHECK: # executed command: env -u FOO BAR=3 echo hello world -# CHECK: # | Error: 'env' cannot call 'echo' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: PASS: shtest-env :: env-calls-env.txt ({{[^)]*}}) -# CHECK: env env [[PYTHON:.+]] print_environment.py | {{.*}} -# CHECK: # executed command: env env [[PYTHON_BARE:.+]] print_environment.py -# CHECK: env FOO=2 env BAR=1 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env FOO=2 env BAR=1 [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO env -u BAR [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO env -u BAR [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO BAR=1 env -u BAR FOO=2 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO BAR=1 env -u BAR FOO=2 [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# CHECK: -- - -# CHECK: FAIL: shtest-env :: env-calls-export.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 export BAZ=3 -# CHECK: # executed command: env -u FOO BAR=3 export BAZ=3 -# CHECK: # | Error: 'env' cannot call 'export' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-mkdir.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 mkdir foobar -# CHECK: # executed 
command: env -u FOO BAR=3 mkdir foobar -# CHECK: # | Error: 'env' cannot call 'mkdir' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-not-builtin.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 not rm {{.+}}.no-such-file -# CHECK: # executed command: env -u FOO BAR=3 not rm {{.+}}.no-such-file{{.*}} -# CHECK: # | Error: 'env' cannot call 'rm' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: FAIL: shtest-env :: env-calls-rm.txt ({{[^)]*}}) -# CHECK: env -u FOO BAR=3 rm foobar -# CHECK: # executed command: env -u FOO BAR=3 rm foobar -# CHECK: # | Error: 'env' cannot call 'rm' -# CHECK: # error: command failed with exit status: {{.*}} - -# CHECK: PASS: shtest-env :: env-u.txt ({{[^)]*}}) -# CHECK: [[PYTHON]] print_environment.py | {{.*}} -# CHECK: env -u FOO [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO [[PYTHON_BARE]] print_environment.py -# CHECK: env -u FOO -u BAR [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env -u FOO -u BAR [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# CHECK: -- - -# CHECK: PASS: shtest-env :: env.txt ({{[^)]*}}) -# CHECK: env A_FOO=999 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=999 [[PYTHON_BARE]] print_environment.py -# CHECK: env A_FOO=1 B_BAR=2 C_OOF=3 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=1 B_BAR=2 C_OOF=3 [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# CHECK: -- - -# CHECK: PASS: shtest-env :: mixed.txt ({{[^)]*}}) -# CHECK: env A_FOO=999 -u FOO [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=999 -u FOO [[PYTHON_BARE]] print_environment.py -# CHECK: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 [[PYTHON]] print_environment.py | {{.*}} -# CHECK: # executed command: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 [[PYTHON_BARE]] print_environment.py -# CHECK-NOT: {{^[^#]}} -# 
CHECK: -- - -# CHECK: Total Discovered Tests: 16 -# CHECK: Passed: 4 {{\([0-9]*\.[0-9]*%\)}} -# CHECK: Failed: 12 {{\([0-9]*\.[0-9]*%\)}} -# CHECK-NOT: {{.}} From 2dc3b509879518340b991733bfde5c7a4becd559 Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Thu, 29 Aug 2024 11:01:52 -0600 Subject: [PATCH 22/72] [HLSL] Apply NoRecurse attrib to all HLSL functions (#105907) Previously, functions named "main" got the NoRecurse attribute consistent with the behavior of C++, which HLSL largely follows. However, standard recursion is not allowed in HLSL, so all functions should really have this attribute. This doesn't prevent recursion, but rather signals that these functions aren't expected to recurse. Practically, this was done so that entry point functions named "main" would have all have the same attributes as otherwise identical entry points with other names. This required small changes to the this assignment tests because they no longer generate so many attribute sets since more of them match. related to #105244 but done to simplify testing for #89806 --- clang/lib/CodeGen/CodeGenFunction.cpp | 10 +- .../implicit-norecurse-attrib.hlsl | 93 +++++++++++++++++++ .../CodeGenHLSL/this-assignment-overload.hlsl | 4 +- clang/test/CodeGenHLSL/this-assignment.hlsl | 4 +- 4 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index c89eaa0f4e3bfc..a5747283e98058 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1064,13 +1064,17 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, // OpenCL C 2.0 v2.2-11 s6.9.i: // Recursion is not supported. // + // HLSL + // Recursion is not supported. 
+ // // SYCL v1.2.1 s3.10: // kernels cannot include RTTI information, exception classes, // recursive code, virtual functions or make use of C++ libraries that // are not compiled for the device. - if (FD && ((getLangOpts().CPlusPlus && FD->isMain()) || - getLangOpts().OpenCL || getLangOpts().SYCLIsDevice || - (getLangOpts().CUDA && FD->hasAttr()))) + if (FD && + ((getLangOpts().CPlusPlus && FD->isMain()) || getLangOpts().OpenCL || + getLangOpts().HLSL || getLangOpts().SYCLIsDevice || + (getLangOpts().CUDA && FD->hasAttr()))) Fn->addFnAttr(llvm::Attribute::NoRecurse); llvm::RoundingMode RM = getLangOpts().getDefaultRoundingMode(); diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl new file mode 100644 index 00000000000000..ae3a3b5f90199f --- /dev/null +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -0,0 +1,93 @@ +// RUN: %clang_cc1 -x hlsl -triple dxil-pc-shadermodel6.3-library -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -x hlsl -triple dxil-pc-shadermodel6.0-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// Verify that a few different function types all get the NoRecurse attribute + +#define MAX 100 + +struct Node { + uint value; + uint key; + uint left, right; +}; + +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define noundef i32 @"?Find@@YAIY0GE@UNode@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] +// CHECK: ret i32 +// Find and return value corresponding to key in the SortedTree +uint Find(Node SortedTree[MAX], uint key) { + uint nix = 0; // head + while(true) { + if (nix < 0) + return 0.0; // Not found + Node n = SortedTree[nix]; + if (n.key == key) + return n.value; + if (key < n.key) + nix = n.left; + else + nix = n.right; + } +} + +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define noundef i1 
@"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] +// CHECK: ret i1 +// Initialize tree with given buffer +// Imagine the inout works +export +bool InitTree(/*inout*/ Node tree[MAX], RWBuffer encodedTree, uint maxDepth) { + uint size = pow(2.f, maxDepth) - 1; + if (size > MAX) return false; + for (uint i = 1; i < size; i++) { + tree[i].value = encodedTree[i].x; + tree[i].key = encodedTree[i].y; + tree[i].left = encodedTree[i].z; + tree[i].right = encodedTree[i].w; + } + return true; +} + +RWBuffer gTree; + +// Mangled entry points are internal +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define internal void @"?main@@YAXI@Z"(i32 noundef %GI) [[IntAttr]] +// CHECK: ret void + +// Canonical entry points are external and shader attributed +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define void @main() [[EntryAttr:\#[0-9]+]] +// CHECK: ret void + +[numthreads(1,1,1)] +[shader("compute")] +void main(uint GI : SV_GroupIndex) { + Node haystack[MAX]; + uint needle = 0; + if (InitTree(haystack, gTree, GI)) + needle = Find(haystack, needle); +} + +// Mangled entry points are internal +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define internal void @"?defaultMain@@YAXXZ"() [[IntAttr]] +// CHECK: ret void + +// Canonical entry points are external and shader attributed +// CHECK: Function Attrs:{{.*}}norecurse +// CHECK: define void @defaultMain() [[EntryAttr]] +// CHECK: ret void + +[numthreads(1,1,1)] +[shader("compute")] +void defaultMain() { + Node haystack[MAX]; + uint needle = 0; + if (InitTree(haystack, gTree, 4)) + needle = Find(haystack, needle); +} + +// CHECK: attributes [[IntAttr]] = {{.*}} norecurse +// CHECK: attributes [[ExtAttr]] = {{.*}} norecurse +// CHECK: attributes [[EntryAttr]] = {{.*}} norecurse diff --git 
a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl index 0c4905e0f45980..f0affcb69a3fcd 100644 --- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl @@ -25,7 +25,7 @@ void main() { } // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators. -// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%Another = alloca %struct.Pair, align 4 @@ -42,7 +42,7 @@ void main() { // CHECK-NEXT:%0 = load i32, ptr %First2, align 4 // CHECK-NEXT:ret i32 %0 -// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4 diff --git a/clang/test/CodeGenHLSL/this-assignment.hlsl b/clang/test/CodeGenHLSL/this-assignment.hlsl index 6916afcde40546..5c8de0a18ef7ca 100644 --- a/clang/test/CodeGenHLSL/this-assignment.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment.hlsl @@ -24,7 +24,7 @@ void main() { } // This tests reference like implicit this in HLSL -// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // 
CHECK-NEXT:%Another = alloca %struct.Pair, align 4 @@ -34,7 +34,7 @@ void main() { // CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %Another, i32 8, i1 false) // CHECK-NEXT:%First = getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 0 -// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 { +// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%ref.tmp = alloca %struct.Pair, align 4 From ecd65e64e885b0fd2786ca99ea0c42d692275d91 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Thu, 29 Aug 2024 10:02:45 -0700 Subject: [PATCH 23/72] [DXIL][test] Fix a few tests now that HLSL functions are internalized (#106437) These tests have been failing since db279c72f2fe "[HLSL] Change default linkage of HLSL functions to internal (#95331)". This presumably went unnoticed because they're not run by default since they rely on an external tool (dxil-dis). 
--- llvm/test/tools/dxil-dis/BasicIR.ll | 2 +- llvm/test/tools/dxil-dis/debug-info.ll | 2 +- llvm/test/tools/dxil-dis/opaque-gep.ll | 4 ++-- llvm/test/tools/dxil-dis/opaque-pointers.ll | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/tools/dxil-dis/BasicIR.ll b/llvm/test/tools/dxil-dis/BasicIR.ll index a79365a938023d..8add2f2bd263ba 100644 --- a/llvm/test/tools/dxil-dis/BasicIR.ll +++ b/llvm/test/tools/dxil-dis/BasicIR.ll @@ -2,7 +2,7 @@ ; RUN: llc --filetype=obj %s --stop-after=dxil-write-bitcode -o %t && llvm-bcanalyzer --dump-blockinfo %t | FileCheck %s --check-prefix=BLOCK_INFO -; CHECK: define i32 @foo(i32 %X, i32 %Y) { +; CHECK: define internal i32 @foo(i32 %X, i32 %Y) { ; CHECK: %Z = sub i32 %X, %Y ; CHECK: %Q = add i32 %Z, %Y ; CHECK: ret i32 %Q diff --git a/llvm/test/tools/dxil-dis/debug-info.ll b/llvm/test/tools/dxil-dis/debug-info.ll index 96e023338e5c26..0b40f275af3cd4 100644 --- a/llvm/test/tools/dxil-dis/debug-info.ll +++ b/llvm/test/tools/dxil-dis/debug-info.ll @@ -2,7 +2,7 @@ target triple = "dxil-unknown-shadermodel6.7-library" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -; CHECK: define float @fma(float, float, float) unnamed_addr #0 !dbg [[Fn:[!][0-9]+]] +; CHECK: define internal float @fma(float, float, float) unnamed_addr #0 !dbg [[Fn:[!][0-9]+]] ; Function Attrs: norecurse nounwind readnone willreturn define dso_local float @fma(float %0, float %1, float %2) local_unnamed_addr #0 !dbg !6 { ; CHECK-NEXT: call void @llvm.dbg.value(metadata float %0, metadata [[VarX:[!][0-9]+]], metadata [[Expr:[!][0-9]+]]), !dbg [[Line1:[!][0-9]+]] diff --git a/llvm/test/tools/dxil-dis/opaque-gep.ll b/llvm/test/tools/dxil-dis/opaque-gep.ll index abec0955f5a6d6..3da57e65b93c4e 100644 --- a/llvm/test/tools/dxil-dis/opaque-gep.ll +++ b/llvm/test/tools/dxil-dis/opaque-gep.ll @@ -7,7 +7,7 @@ define i32 @fn(ptr %0) { ret i32 %3 } -; CHECK: define i32 @fn(i32*) +; CHECK: define internal i32 @fn(i32*) 
; CHECK-NEXT: %2 = getelementptr i32, i32* %0, i32 4 ; CHECK-NEXT: %3 = load i32, i32* %2, align 4 @@ -17,6 +17,6 @@ define i32 @fn2(ptr addrspace(1) %0) { ret i32 %3 } -; CHECK: define i32 @fn2(i32 addrspace(1)*) +; CHECK: define internal i32 @fn2(i32 addrspace(1)*) ; CHECK-NEXT: %2 = getelementptr i32, i32 addrspace(1)* %0, i32 4 ; CHECK-NEXT: %3 = load i32, i32 addrspace(1)* %2, align 4 diff --git a/llvm/test/tools/dxil-dis/opaque-pointers.ll b/llvm/test/tools/dxil-dis/opaque-pointers.ll index 81f6a4ca97f610..87e23919629e21 100644 --- a/llvm/test/tools/dxil-dis/opaque-pointers.ll +++ b/llvm/test/tools/dxil-dis/opaque-pointers.ll @@ -7,7 +7,7 @@ define i64 @test(ptr %p) { ret i64 %v } -; CHECK: define i64 @test(i8* %p) { +; CHECK: define internal i64 @test(i8* %p) { ; CHECK-NEXT: %1 = bitcast i8* %p to i32* ; CHECK-NEXT: store i32 0, i32* %1, align 4 ; CHECK-NEXT: %2 = bitcast i8* %p to i64* @@ -19,7 +19,7 @@ define i64 @test2(ptr %p) { ret i64 %v } -; CHECK: define i64 @test2(i64* %p) { +; CHECK: define internal i64 @test2(i64* %p) { ; CHECK-NEXT: store i64 0, i64* %p, align 8 ; CHECK-NEXT: %v = load i64, i64* %p, align 8 @@ -29,7 +29,7 @@ define i64 @test3(ptr addrspace(1) %p) { ret i64 %v } -; CHECK: define i64 @test3(i8 addrspace(1)* %p) { +; CHECK: define internal i64 @test3(i8 addrspace(1)* %p) { ; CHECK-NEXT: %1 = bitcast i8 addrspace(1)* %p to i32 addrspace(1)* ; CHECK-NEXT: store i32 0, i32 addrspace(1)* %1, align 4 ; CHECK-NEXT: %2 = bitcast i8 addrspace(1)* %p to i64 addrspace(1)* @@ -41,7 +41,7 @@ define i64 @test4(ptr addrspace(1) %p) { ret i64 %v } -; CHECK: define i64 @test4(i64 addrspace(1)* %p) { +; CHECK: define internal i64 @test4(i64 addrspace(1)* %p) { ; CHECK-NEXT: store i64 0, i64 addrspace(1)* %p, align 8 ; CHECK-NEXT: %v = load i64, i64 addrspace(1)* %p, align 8 @@ -53,7 +53,7 @@ define i64 @test5(ptr %p) { ret i64 %v } -; CHECK: define i64 @test5(i8* %p) { +; CHECK: define internal i64 @test5(i8* %p) { ; CHECK-NEXT: %casted = 
addrspacecast i8* %p to i64 addrspace(1)* ; CHECK-NEXT: store i64 0, i64 addrspace(1)* %casted, align 8 ; CHECK-NEXT: %v = load i64, i64 addrspace(1)* %casted, align 8 From 2ad782f49ff20d199f31cabc9baa46dba6047d8b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 10:14:57 -0700 Subject: [PATCH 24/72] [VP] Kill VP_PROPERTY_(MEMOP,CASTOP) and simplify _CONSTRAINEDFP [nfc] (#105574) These lists are quite static. Heavy use of macros is undesirable, and not idiomatic in LLVM, so let's just use the naive switch cases. Note that the first two fields in the CONSTRAINEDFP property were utterly unused (aside from a C++ test). In the same vien as https://github.com/llvm/llvm-project/pull/105551. Once both changes have landed, we'll be left with _BINARYOP which needs a bit of additional untangling, and the actual opcode mappings. --- llvm/include/llvm/IR/VPIntrinsics.def | 58 +++++++-------------------- llvm/lib/IR/IntrinsicInst.cpp | 39 +++++++++--------- llvm/unittests/IR/VPIntrinsicTest.cpp | 16 -------- 3 files changed, 33 insertions(+), 80 deletions(-) diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 3fad00e2caf21f..e81752dc33a9ab 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -95,15 +95,10 @@ #define VP_PROPERTY_FUNCTIONAL_OPC(OPC) #endif -// Whether the intrinsic may have a rounding mode or exception behavior operand -// bundle. -// \p HASROUND '1' if the intrinsic can have a rounding mode operand bundle, -// '0' otherwise. -// \p HASEXCEPT '1' if the intrinsic can have an exception behavior operand -// bundle, '0' otherwise. -// \p INTRINID The constrained fp intrinsic this VP intrinsic corresponds to. +// If operation can have rounding or fp exceptions, maps to corresponding +// constrained fp intrinsic. 
#ifndef VP_PROPERTY_CONSTRAINEDFP -#define VP_PROPERTY_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) +#define VP_PROPERTY_CONSTRAINEDFP(INTRINID) #endif // The intrinsic and/or SDNode has the same function as this ISD Opcode. @@ -123,22 +118,11 @@ #define VP_PROPERTY_NO_FUNCTIONAL #endif -// This VP Intrinsic is a memory operation -// The pointer arg is at POINTERPOS and the data arg is at DATAPOS. -#ifndef VP_PROPERTY_MEMOP -#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) -#endif - // A property to infer VP binary-op SDNode opcodes automatically. #ifndef VP_PROPERTY_BINARYOP #define VP_PROPERTY_BINARYOP #endif -// A property to infer VP type casts automatically. -#ifndef VP_PROPERTY_CASTOP -#define VP_PROPERTY_CASTOP -#endif - /// } Property Macros ///// Integer Arithmetic { @@ -327,7 +311,7 @@ END_REGISTER_VP(vp_usub_sat, VP_USUBSAT) #define HELPER_REGISTER_BINARY_FP_VP(OPSUFFIX, VPSD, IROPC, SDOPC) \ BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, VPSD, -1) \ VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ - VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \ + VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_##OPSUFFIX) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ VP_PROPERTY_BINARYOP \ END_REGISTER_VP(vp_##OPSUFFIX, VPSD) @@ -369,14 +353,14 @@ END_REGISTER_VP(vp_sqrt, VP_SQRT) // llvm.vp.fma(x,y,z,mask,vlen) BEGIN_REGISTER_VP(vp_fma, 3, 4, VP_FMA, -1) -VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_fma) +VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_fma) VP_PROPERTY_FUNCTIONAL_INTRINSIC(fma) VP_PROPERTY_FUNCTIONAL_SDOPC(FMA) END_REGISTER_VP(vp_fma, VP_FMA) // llvm.vp.fmuladd(x,y,z,mask,vlen) BEGIN_REGISTER_VP(vp_fmuladd, 3, 4, VP_FMULADD, -1) -VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_fmuladd) +VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_fmuladd) VP_PROPERTY_FUNCTIONAL_INTRINSIC(fmuladd) VP_PROPERTY_FUNCTIONAL_SDOPC(FMAD) END_REGISTER_VP(vp_fmuladd, VP_FMULADD) @@ -479,31 +463,30 @@ END_REGISTER_VP(vp_llrint, VP_LLRINT) #error \ 
"The internal helper macro HELPER_REGISTER_FP_CAST_VP is already defined!" #endif -#define HELPER_REGISTER_FP_CAST_VP(OPSUFFIX, VPSD, IROPC, SDOPC, HASROUND) \ +#define HELPER_REGISTER_FP_CAST_VP(OPSUFFIX, VPSD, IROPC, SDOPC) \ BEGIN_REGISTER_VP(vp_##OPSUFFIX, 1, 2, VPSD, -1) \ VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ - VP_PROPERTY_CONSTRAINEDFP(HASROUND, 1, experimental_constrained_##OPSUFFIX) \ - VP_PROPERTY_CASTOP \ + VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_##OPSUFFIX) \ END_REGISTER_VP(vp_##OPSUFFIX, VPSD) // llvm.vp.fptoui(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fptoui, VP_FP_TO_UINT, FPToUI, FP_TO_UINT, 0) +HELPER_REGISTER_FP_CAST_VP(fptoui, VP_FP_TO_UINT, FPToUI, FP_TO_UINT) // llvm.vp.fptosi(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fptosi, VP_FP_TO_SINT, FPToSI, FP_TO_SINT, 0) +HELPER_REGISTER_FP_CAST_VP(fptosi, VP_FP_TO_SINT, FPToSI, FP_TO_SINT) // llvm.vp.uitofp(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(uitofp, VP_UINT_TO_FP, UIToFP, UINT_TO_FP, 1) +HELPER_REGISTER_FP_CAST_VP(uitofp, VP_UINT_TO_FP, UIToFP, UINT_TO_FP) // llvm.vp.sitofp(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(sitofp, VP_SINT_TO_FP, SIToFP, SINT_TO_FP, 1) +HELPER_REGISTER_FP_CAST_VP(sitofp, VP_SINT_TO_FP, SIToFP, SINT_TO_FP) // llvm.vp.fptrunc(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fptrunc, VP_FP_ROUND, FPTrunc, FP_ROUND, 1) +HELPER_REGISTER_FP_CAST_VP(fptrunc, VP_FP_ROUND, FPTrunc, FP_ROUND) // llvm.vp.fpext(x,mask,vlen) -HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, FP_EXTEND, 0) +HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, FP_EXTEND) #undef HELPER_REGISTER_FP_CAST_VP @@ -517,7 +500,6 @@ HELPER_REGISTER_FP_CAST_VP(fpext, VP_FP_EXTEND, FPExt, FP_EXTEND, 0) BEGIN_REGISTER_VP(vp_##OPSUFFIX, 1, 2, VPSD, -1) \ VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ - VP_PROPERTY_CASTOP \ END_REGISTER_VP(vp_##OPSUFFIX, VPSD) // llvm.vp.trunc(x,mask,vlen) @@ -532,13 +514,11 @@ HELPER_REGISTER_INT_CAST_VP(sext, 
VP_SIGN_EXTEND, SExt, SIGN_EXTEND) // llvm.vp.ptrtoint(x,mask,vlen) BEGIN_REGISTER_VP(vp_ptrtoint, 1, 2, VP_PTRTOINT, -1) VP_PROPERTY_FUNCTIONAL_OPC(PtrToInt) -VP_PROPERTY_CASTOP END_REGISTER_VP(vp_ptrtoint, VP_PTRTOINT) // llvm.vp.inttoptr(x,mask,vlen) BEGIN_REGISTER_VP(vp_inttoptr, 1, 2, VP_INTTOPTR, -1) VP_PROPERTY_FUNCTIONAL_OPC(IntToPtr) -VP_PROPERTY_CASTOP END_REGISTER_VP(vp_inttoptr, VP_INTTOPTR) #undef HELPER_REGISTER_INT_CAST_VP @@ -555,7 +535,7 @@ END_REGISTER_VP_SDNODE(VP_SETCC) BEGIN_REGISTER_VP_INTRINSIC(vp_fcmp, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_fcmp, VP_SETCC) VP_PROPERTY_FUNCTIONAL_OPC(FCmp) -VP_PROPERTY_CONSTRAINEDFP(0, 1, experimental_constrained_fcmp) +VP_PROPERTY_CONSTRAINEDFP(experimental_constrained_fcmp) END_REGISTER_VP_INTRINSIC(vp_fcmp) // llvm.vp.icmp(x,y,cc,mask,vlen) @@ -579,7 +559,6 @@ BEGIN_REGISTER_VP_SDNODE(VP_STORE, 1, vp_store, 4, 5) HELPER_MAP_VPID_TO_VPSD(vp_store, VP_STORE) VP_PROPERTY_FUNCTIONAL_OPC(Store) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_store) -VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_store, VP_STORE) // llvm.experimental.vp.strided.store(val,ptr,stride,mask,vlen) @@ -588,7 +567,6 @@ BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_store, 3, 4) VP_PROPERTY_NO_FUNCTIONAL BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_STRIDED_STORE, 1, experimental_vp_strided_store, 5, 6) HELPER_MAP_VPID_TO_VPSD(experimental_vp_strided_store, EXPERIMENTAL_VP_STRIDED_STORE) -VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(experimental_vp_strided_store, EXPERIMENTAL_VP_STRIDED_STORE) // llvm.vp.scatter(ptr,val,mask,vlen) @@ -597,7 +575,6 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3) BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, 1, vp_scatter, 5, 6) HELPER_MAP_VPID_TO_VPSD(vp_scatter, VP_SCATTER) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_scatter) -VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_scatter, VP_SCATTER) // llvm.vp.load(ptr,mask,vlen) @@ -607,7 +584,6 @@ BEGIN_REGISTER_VP_SDNODE(VP_LOAD, -1, vp_load, 3, 4) HELPER_MAP_VPID_TO_VPSD(vp_load, VP_LOAD) 
VP_PROPERTY_FUNCTIONAL_OPC(Load) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load) -VP_PROPERTY_MEMOP(0, std::nullopt) END_REGISTER_VP(vp_load, VP_LOAD) // llvm.experimental.vp.strided.load(ptr,stride,mask,vlen) @@ -616,7 +592,6 @@ BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_load, 2, 3) VP_PROPERTY_NO_FUNCTIONAL BEGIN_REGISTER_VP_SDNODE(EXPERIMENTAL_VP_STRIDED_LOAD, -1, experimental_vp_strided_load, 4, 5) HELPER_MAP_VPID_TO_VPSD(experimental_vp_strided_load, EXPERIMENTAL_VP_STRIDED_LOAD) -VP_PROPERTY_MEMOP(0, std::nullopt) END_REGISTER_VP(experimental_vp_strided_load, EXPERIMENTAL_VP_STRIDED_LOAD) // llvm.vp.gather(ptr,mask,vlen) @@ -625,7 +600,6 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_gather, 1, 2) BEGIN_REGISTER_VP_SDNODE(VP_GATHER, -1, vp_gather, 4, 5) HELPER_MAP_VPID_TO_VPSD(vp_gather, VP_GATHER) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_gather) -VP_PROPERTY_MEMOP(0, std::nullopt) END_REGISTER_VP(vp_gather, VP_GATHER) ///// } Memory Operations @@ -778,10 +752,8 @@ END_REGISTER_VP(experimental_vp_splat, EXPERIMENTAL_VP_SPLAT) #undef END_REGISTER_VP_SDNODE #undef HELPER_MAP_VPID_TO_VPSD #undef VP_PROPERTY_BINARYOP -#undef VP_PROPERTY_CASTOP #undef VP_PROPERTY_CONSTRAINEDFP #undef VP_PROPERTY_FUNCTIONAL_INTRINSIC #undef VP_PROPERTY_FUNCTIONAL_OPC #undef VP_PROPERTY_FUNCTIONAL_SDOPC #undef VP_PROPERTY_NO_FUNCTIONAL -#undef VP_PROPERTY_MEMOP diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 966fa62abd94fe..7ed82c2ece464a 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -479,13 +479,16 @@ std::optional VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_MEMOP(POINTERPOS, ...) 
return POINTERPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return std::nullopt; + case Intrinsic::vp_store: + case Intrinsic::vp_scatter: + case Intrinsic::experimental_vp_strided_store: + return 1; + case Intrinsic::vp_load: + case Intrinsic::vp_gather: + case Intrinsic::experimental_vp_strided_load: + return 0; } - return std::nullopt; } /// \return The data (payload) operand of this store or scatter. @@ -499,13 +502,12 @@ Value *VPIntrinsic::getMemoryDataParam() const { std::optional VPIntrinsic::getMemoryDataParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) return DATAPOS; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" + return std::nullopt; + case Intrinsic::vp_store: + case Intrinsic::vp_scatter: + case Intrinsic::experimental_vp_strided_store: + return 0; } - return std::nullopt; } constexpr bool isVPIntrinsic(Intrinsic::ID ID) { @@ -589,7 +591,7 @@ VPIntrinsic::getConstrainedIntrinsicIDForVP(Intrinsic::ID ID) { default: break; #define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define VP_PROPERTY_CONSTRAINEDFP(HASRND, HASEXCEPT, CID) return Intrinsic::CID; +#define VP_PROPERTY_CONSTRAINEDFP(CID) return Intrinsic::CID; #define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } @@ -760,14 +762,9 @@ bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { } bool VPCastIntrinsic::isVPCast(Intrinsic::ID ID) { - switch (ID) { - default: - break; -#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) 
case Intrinsic::VPID: -#define VP_PROPERTY_CASTOP return true; -#define END_REGISTER_VP_INTRINSIC(VPID) break; -#include "llvm/IR/VPIntrinsics.def" - } + // All of the vp.casts correspond to instructions + if (std::optional Opc = getFunctionalOpcodeForVP(ID)) + return Instruction::isCast(*Opc); return false; } diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index cf0a10d1f2e959..925a69bafa07ef 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -454,22 +454,6 @@ TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { } } -/// Check that the HANDLE_VP_TO_CONSTRAINEDFP maps to an existing intrinsic with -/// the right amount of constrained-fp metadata args. -TEST_F(VPIntrinsicTest, HandleToConstrainedFP) { -#define VP_PROPERTY_CONSTRAINEDFP(HASROUND, HASEXCEPT, CFPID) \ - { \ - SmallVector T; \ - Intrinsic::getIntrinsicInfoTableEntries(Intrinsic::CFPID, T); \ - unsigned NumMetadataArgs = 0; \ - for (auto TD : T) \ - NumMetadataArgs += (TD.Kind == Intrinsic::IITDescriptor::Metadata); \ - bool IsCmp = Intrinsic::CFPID == Intrinsic::experimental_constrained_fcmp; \ - ASSERT_EQ(NumMetadataArgs, (unsigned)(IsCmp + HASROUND + HASEXCEPT)); \ - } -#include "llvm/IR/VPIntrinsics.def" -} - } // end anonymous namespace /// Check various properties of VPReductionIntrinsics From 0a00d32c5f88fce89006dcde6e235bc77d7b495e Mon Sep 17 00:00:00 2001 From: Jordan R AW Date: Thu, 29 Aug 2024 10:16:17 -0700 Subject: [PATCH 25/72] [lldb] Add armv7a and armv8a ArchSpecs (#106433) armv7a and armv8a are common names for the application subarch for arm. These names in particular are used in ChromeOS, Android, and a few other known applications. In ChromeOS, we encountered a bug where armv7a arch was not recognised and segfaulted when starting an executable on an arm32 device. 
Google Issue Tracker: https://issuetracker.google.com/361414339 --- lldb/include/lldb/Utility/ArchSpec.h | 2 ++ lldb/source/Utility/ArchSpec.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 50830b889b9115..5990f984b09e2d 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -123,6 +123,7 @@ class ArchSpec { eCore_arm_armv6, eCore_arm_armv6m, eCore_arm_armv7, + eCore_arm_armv7a, eCore_arm_armv7l, eCore_arm_armv7f, eCore_arm_armv7s, @@ -145,6 +146,7 @@ class ArchSpec { eCore_thumbv7em, eCore_arm_arm64, eCore_arm_armv8, + eCore_arm_armv8a, eCore_arm_armv8l, eCore_arm_arm64e, eCore_arm_arm64_32, diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 4fd1a800023ce3..85bb85044ec156 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -60,6 +60,8 @@ static const CoreDefinition g_core_definitions[] = { "armv6m"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7, "armv7"}, + {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7a, + "armv7a"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7l, "armv7l"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv7f, @@ -102,6 +104,8 @@ static const CoreDefinition g_core_definitions[] = { ArchSpec::eCore_arm_arm64, "arm64"}, {eByteOrderLittle, 8, 4, 4, llvm::Triple::aarch64, ArchSpec::eCore_arm_armv8, "armv8"}, + {eByteOrderLittle, 8, 4, 4, llvm::Triple::aarch64, + ArchSpec::eCore_arm_armv8a, "armv8a"}, {eByteOrderLittle, 4, 2, 4, llvm::Triple::arm, ArchSpec::eCore_arm_armv8l, "armv8l"}, {eByteOrderLittle, 8, 4, 4, llvm::Triple::aarch64, From ed37b5f6c341a2c72d1f5f0c016f0f8a41a9bf83 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 29 Aug 2024 10:30:11 -0700 Subject: [PATCH 26/72] Revert "[Support] Validate number of arguments passed to formatv()" 
(#106589) Reverts llvm/llvm-project#105745 Some bots are broken apparently. --- .../Checkers/StdLibraryFunctionsChecker.cpp | 5 +- llvm/benchmarks/CMakeLists.txt | 1 - llvm/benchmarks/FormatVariadicBM.cpp | 63 -------------- llvm/include/llvm/Support/FormatVariadic.h | 39 ++++----- llvm/lib/Support/FormatVariadic.cpp | 85 +++---------------- llvm/unittests/Support/FormatVariadicTest.cpp | 56 ++++++------ mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 +- 7 files changed, 58 insertions(+), 195 deletions(-) delete mode 100644 llvm/benchmarks/FormatVariadicBM.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 4f30b2a0e7e7da..8f4bd17afc8581 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1401,10 +1401,7 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, ErrnoNote = llvm::formatv("After calling '{0}' {1}", FunctionName, ErrnoNote); } else { - // Disable formatv() validation as the case note may not always have the - // {0} placeholder for function name. 
- CaseNote = - llvm::formatv(false, Case.getNote().str().c_str(), FunctionName); + CaseNote = llvm::formatv(Case.getNote().str().c_str(), FunctionName); } const SVal RV = Call.getReturnValue(); diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index e3366e6f3ffe19..713d4ccd3c5975 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -5,4 +5,3 @@ set(LLVM_LINK_COMPONENTS add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) -add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp deleted file mode 100644 index c03ead400d0d5c..00000000000000 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "llvm/Support/FormatVariadic.h" -#include -#include -#include - -using namespace llvm; -using namespace std; - -// Generate a list of format strings that have `NumReplacements` replacements -// by permuting the replacements and some literal text. -static vector getFormatStrings(int NumReplacements) { - vector Components; - for (int I = 0; I < NumReplacements; I++) - Components.push_back("{" + to_string(I) + "}"); - // Intersperse these with some other literal text (_). 
- const string_view Literal = "____"; - for (char C : Literal) - Components.push_back(string(1, C)); - - vector Formats; - do { - string Concat; - for (const string &C : Components) - Concat += C; - Formats.emplace_back(Concat); - } while (next_permutation(Components.begin(), Components.end())); - return Formats; -} - -// Generate the set of formats to exercise outside the benchmark code. -static const vector> Formats = { - getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), - getFormatStrings(4), getFormatStrings(5), -}; - -// Benchmark formatv() for a variety of format strings and 1-5 replacements. -static void BM_FormatVariadic(benchmark::State &state) { - for (auto _ : state) { - for (const string &Fmt : Formats[0]) - formatv(Fmt.c_str(), 1).str(); - for (const string &Fmt : Formats[1]) - formatv(Fmt.c_str(), 1, 2).str(); - for (const string &Fmt : Formats[2]) - formatv(Fmt.c_str(), 1, 2, 3).str(); - for (const string &Fmt : Formats[3]) - formatv(Fmt.c_str(), 1, 2, 3, 4).str(); - for (const string &Fmt : Formats[4]) - formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); - } -} - -BENCHMARK(BM_FormatVariadic); - -BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index f31ad70021579e..595f2cf559a428 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -67,20 +67,23 @@ class formatv_object_base { protected: StringRef Fmt; ArrayRef Adapters; - bool Validate; + + static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad); + + static std::pair + splitLiteralAndReplacement(StringRef Fmt); formatv_object_base(StringRef Fmt, - ArrayRef Adapters, - bool Validate) - : Fmt(Fmt), Adapters(Adapters), Validate(Validate) {} + ArrayRef Adapters) + : Fmt(Fmt), Adapters(Adapters) {} formatv_object_base(formatv_object_base const &rhs) = delete; formatv_object_base(formatv_object_base &&rhs) = default; public: void 
format(raw_ostream &S) const { - const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); - for (const auto &R : Replacements) { + for (auto &R : parseFormatString(Fmt)) { if (R.Type == ReplacementType::Empty) continue; if (R.Type == ReplacementType::Literal) { @@ -98,10 +101,9 @@ class formatv_object_base { Align.format(S, R.Options); } } + static SmallVector parseFormatString(StringRef Fmt); - // Parse and optionally validate format string (in debug builds). - static SmallVector - parseFormatString(StringRef Fmt, size_t NumArgs, bool Validate); + static std::optional parseReplacementItem(StringRef Spec); std::string str() const { std::string Result; @@ -147,8 +149,8 @@ template class formatv_object : public formatv_object_base { }; public: - formatv_object(StringRef Fmt, Tuple &&Params, bool Validate) - : formatv_object_base(Fmt, ParameterPointers, Validate), + formatv_object(StringRef Fmt, Tuple &&Params) + : formatv_object_base(Fmt, ParameterPointers), Parameters(std::move(Params)) { ParameterPointers = std::apply(create_adapters(), Parameters); } @@ -245,22 +247,15 @@ template class formatv_object : public formatv_object_base { // assertion. Otherwise, it will try to do something reasonable, but in general // the details of what that is are undefined. // - -// formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) +inline auto formatv(const char *Fmt, Ts &&...Vals) -> formatv_object(Vals))...))> { using ParamTuple = decltype(std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...)); - auto Params = std::make_tuple( - support::detail::build_format_adapter(std::forward(Vals))...); - return formatv_object(Fmt, std::move(Params), Validate); -} - -// formatv() with validation enabled. 
-template inline auto formatv(const char *Fmt, Ts &&...Vals) { - return formatv(true, Fmt, std::forward(Vals)...); + return formatv_object( + Fmt, std::make_tuple(support::detail::build_format_adapter( + std::forward(Vals))...)); } } // end namespace llvm diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index 26d2b549136e43..e25d036cdf1e8c 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -25,8 +25,8 @@ static std::optional translateLocChar(char C) { LLVM_BUILTIN_UNREACHABLE; } -static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { +bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -35,7 +35,8 @@ static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, if (Spec.size() > 1) { // A maximum of 2 characters at the beginning can be used for something - // other than the width. + // other + // than the width. // If Spec[1] is a loc char, then Spec[0] is a pad char and Spec[2:...] // contains the width. // Otherwise, if Spec[0] is a loc char, then Spec[1:...] contains the width. 
@@ -54,7 +55,8 @@ static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, return !Failed; } -static std::optional parseReplacementItem(StringRef Spec) { +std::optional +formatv_object_base::parseReplacementItem(StringRef Spec) { StringRef RepString = Spec.trim("{}"); // If the replacement sequence does not start with a non-negative integer, @@ -80,14 +82,15 @@ static std::optional parseReplacementItem(StringRef Spec) { RepString = StringRef(); } RepString = RepString.trim(); - assert(RepString.empty() && - "Unexpected characters found in replacement string!"); + if (!RepString.empty()) { + assert(false && "Unexpected characters found in replacement string!"); + } return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; } -static std::pair -splitLiteralAndReplacement(StringRef Fmt) { +std::pair +formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { while (!Fmt.empty()) { // Everything up until the first brace is a literal. if (Fmt.front() != '{') { @@ -140,77 +143,15 @@ splitLiteralAndReplacement(StringRef Fmt) { return std::make_pair(ReplacementItem{Fmt}, StringRef()); } -#ifndef NDEBUG -#define ENABLE_VALIDATION 1 -#else -#define ENABLE_VALIDATION 0 // Conveniently enable validation in release mode. -#endif - SmallVector -formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, - bool Validate) { +formatv_object_base::parseFormatString(StringRef Fmt) { SmallVector Replacements; - -#if ENABLE_VALIDATION - const StringRef SavedFmtStr = Fmt; - size_t NumExpectedArgs = 0; -#endif - + ReplacementItem I; while (!Fmt.empty()) { - ReplacementItem I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); if (I.Type != ReplacementType::Empty) Replacements.push_back(I); -#if ENABLE_VALIDATION - if (I.Type == ReplacementType::Format) - NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); -#endif - } - -#if ENABLE_VALIDATION - if (!Validate) - return Replacements; - - // Perform additional validation. 
Verify that the number of arguments matches - // the number of replacement indices and that there are no holes in the - // replacement indices. - - // When validation fails, return an array of replacement items that - // will print an error message as the outout of this formatv() (used when - // validation is enabled in release mode). - auto getErrorReplacements = [SavedFmtStr](StringLiteral ErrorMsg) { - return SmallVector{ - ReplacementItem("Invalid formatv() call: "), ReplacementItem(ErrorMsg), - ReplacementItem(" for format string: "), ReplacementItem(SavedFmtStr)}; - }; - - if (NumExpectedArgs != NumArgs) { - errs() << formatv( - "Expected {0} Args, but got {1} for format string '{2}'\n", - NumExpectedArgs, NumArgs, SavedFmtStr); - assert(0 && "Invalid formatv() call"); - return getErrorReplacements("Unexpected number of arguments"); - } - - // Find the number of unique indices seen. All replacement indices - // are < NumExpectedArgs. - SmallVector Indices(NumExpectedArgs); - size_t Count = 0; - for (const ReplacementItem &I : Replacements) { - if (I.Type != ReplacementType::Format || Indices[I.Index]) - continue; - Indices[I.Index] = true; - ++Count; - } - - if (Count != NumExpectedArgs) { - errs() << formatv( - "Replacement field indices cannot have holes for format string '{0}'\n", - SavedFmtStr); - assert(0 && "Invalid format string"); - return getErrorReplacements("Replacement indices have holes"); } -#endif // ENABLE_VALIDATION return Replacements; } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index 6ee0d924867419..a78b25c53d7e43 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -9,11 +9,9 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatAdapters.h" -#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; -using ::testing::HasSubstr; // Compile-time tests 
templates in the detail namespace. namespace { @@ -37,19 +35,14 @@ struct NoFormat {}; static_assert(uses_missing_provider::value, ""); } -// Helper to parse format string with no validation. -static SmallVector parseFormatString(StringRef Fmt) { - return formatv_object_base::parseFormatString(Fmt, 0, false); -} - TEST(FormatVariadicTest, EmptyFormatString) { - auto Replacements = parseFormatString(""); + auto Replacements = formatv_object_base::parseFormatString(""); EXPECT_EQ(0U, Replacements.size()); } TEST(FormatVariadicTest, NoReplacements) { const StringRef kFormatString = "This is a test"; - auto Replacements = parseFormatString(kFormatString); + auto Replacements = formatv_object_base::parseFormatString(kFormatString); ASSERT_EQ(1U, Replacements.size()); EXPECT_EQ(kFormatString, Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -57,25 +50,25 @@ TEST(FormatVariadicTest, NoReplacements) { TEST(FormatVariadicTest, EscapedBrace) { // {{ should be replaced with { - auto Replacements = parseFormatString("{{"); + auto Replacements = formatv_object_base::parseFormatString("{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // An even number N of braces should be replaced with N/2 braces. - Replacements = parseFormatString("{{{{{{"); + Replacements = formatv_object_base::parseFormatString("{{{{{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{{{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. - Replacements = parseFormatString("}"); + Replacements = formatv_object_base::parseFormatString("}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. 
- Replacements = parseFormatString("}}}"); + Replacements = formatv_object_base::parseFormatString("}}}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}}}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -83,14 +76,14 @@ TEST(FormatVariadicTest, EscapedBrace) { TEST(FormatVariadicTest, ValidReplacementSequence) { // 1. Simple replacement - parameter index only - auto Replacements = parseFormatString("{0}"); + auto Replacements = formatv_object_base::parseFormatString("{0}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(0u, Replacements[0].Align); EXPECT_EQ("", Replacements[0].Options); - Replacements = parseFormatString("{1}"); + Replacements = formatv_object_base::parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); @@ -99,7 +92,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 2. Parameter index with right alignment - Replacements = parseFormatString("{0,3}"); + Replacements = formatv_object_base::parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -108,7 +101,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 3. And left alignment - Replacements = parseFormatString("{0,-3}"); + Replacements = formatv_object_base::parseFormatString("{0,-3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -117,7 +110,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. 
And center alignment - Replacements = parseFormatString("{0,=3}"); + Replacements = formatv_object_base::parseFormatString("{0,=3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -126,7 +119,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. Parameter index with option string - Replacements = parseFormatString("{0:foo}"); + Replacements = formatv_object_base::parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -135,7 +128,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 5. Parameter index with alignment before option string - Replacements = parseFormatString("{0,-3:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,-3:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -144,7 +137,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 7. Parameter indices, options, and alignment can all have whitespace. - Replacements = parseFormatString("{ 0, -3 : foo }"); + Replacements = formatv_object_base::parseFormatString("{ 0, -3 : foo }"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -154,7 +147,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. 
- Replacements = parseFormatString("{0:0:1}"); + Replacements = formatv_object_base::parseFormatString("{0:0:1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -164,7 +157,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:1", Replacements[0].Options); // 9. Custom padding character - Replacements = parseFormatString("{0,p+4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,p+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -175,7 +168,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // Format string special characters are allowed as padding character - Replacements = parseFormatString("{0,-+4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,-+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -185,7 +178,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = parseFormatString("{0,+-4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,+-4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -195,7 +188,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = parseFormatString("{0,==4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,==4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -205,7 +198,7 @@ TEST(FormatVariadicTest, 
ValidReplacementSequence) { EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = parseFormatString("{0,:=4:foo}"); + Replacements = formatv_object_base::parseFormatString("{0,:=4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -218,7 +211,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { TEST(FormatVariadicTest, DefaultReplacementValues) { // 2. If options string is missing, it defaults to empty. - auto Replacements = parseFormatString("{0,3}"); + auto Replacements = formatv_object_base::parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -226,7 +219,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. - Replacements = parseFormatString("{0,3:}"); + Replacements = formatv_object_base::parseFormatString("{0,3:}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -234,7 +227,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space - Replacements = parseFormatString("{0:foo}"); + Replacements = formatv_object_base::parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); @@ -245,7 +238,8 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { } TEST(FormatVariadicTest, MultipleReplacements) { - auto Replacements = parseFormatString("{0} {1:foo}-{2,-3:bar}"); + auto Replacements = + formatv_object_base::parseFormatString("{0} {1:foo}-{2,-3:bar}"); ASSERT_EQ(5u, Replacements.size()); // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 7016fe41ca75d0..82f8718fc556ad 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1654,12 +1654,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, dir->shouldBeQualified() ? qualifiedTypeParserCode : typeParserCode; TypeSwitch(dir->getArg()) .Case([&](auto operand) { - body << formatv(false, parserCode, + body << formatv(parserCode, operand->getVar()->constraint.getCppType(), listName); }) .Default([&](auto operand) { - body << formatv(false, parserCode, "::mlir::Type", listName); + body << formatv(parserCode, "::mlir::Type", listName); }); } } else if (auto *dir = dyn_cast(element)) { From 67ffd1438379ee43f678f3e7752f4ec5f777cee4 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 29 Aug 2024 14:36:39 -0300 Subject: [PATCH 27/72] libcxx: [NFC] relax error expectation for clang diagnostics (#106591) This is a split-off from #96023, where this change has already been reviewed by libcxx maintainers. This will prevent that PR from triggering libcxx-ci from now on. 
--- libcxx/test/libcxx/type_traits/is_specialization.verify.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp index a798647d56ee1c..3593c2e095db91 100644 --- a/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp +++ b/libcxx/test/libcxx/type_traits/is_specialization.verify.cpp @@ -17,5 +17,5 @@ #include #include -// expected-error@+1 {{template template argument has different template parameters than its corresponding template template parameter}} +// expected-error-re@*:* {{{{could not match _Size against 'type-parameter-0-0'|different template parameters}}}} static_assert(!std::__is_specialization_v, std::array>); From 9ce4af5cadc24060f3c3674e01902d374afea983 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 29 Aug 2024 10:39:40 -0700 Subject: [PATCH 28/72] Revert "Revert "[Support] Validate number of arguments passed to formatv()"" (#106592) Reverts llvm/llvm-project#106589 The fix for bot failures caused by the reverted commit was committed already, so this revert is not needed. 
--- .../Checkers/StdLibraryFunctionsChecker.cpp | 5 +- llvm/benchmarks/CMakeLists.txt | 1 + llvm/benchmarks/FormatVariadicBM.cpp | 63 ++++++++++++++ llvm/include/llvm/Support/FormatVariadic.h | 39 +++++---- llvm/lib/Support/FormatVariadic.cpp | 85 ++++++++++++++++--- llvm/unittests/Support/FormatVariadicTest.cpp | 56 ++++++------ mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 +- 7 files changed, 195 insertions(+), 58 deletions(-) create mode 100644 llvm/benchmarks/FormatVariadicBM.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 8f4bd17afc8581..4f30b2a0e7e7da 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1401,7 +1401,10 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, ErrnoNote = llvm::formatv("After calling '{0}' {1}", FunctionName, ErrnoNote); } else { - CaseNote = llvm::formatv(Case.getNote().str().c_str(), FunctionName); + // Disable formatv() validation as the case note may not always have the + // {0} placeholder for function name. 
+ CaseNote = + llvm::formatv(false, Case.getNote().str().c_str(), FunctionName); } const SVal RV = Call.getReturnValue(); diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index 713d4ccd3c5975..e3366e6f3ffe19 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -5,3 +5,4 @@ set(LLVM_LINK_COMPONENTS add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp new file mode 100644 index 00000000000000..c03ead400d0d5c --- /dev/null +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -0,0 +1,63 @@ +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +using namespace llvm; +using namespace std; + +// Generate a list of format strings that have `NumReplacements` replacements +// by permuting the replacements and some literal text. +static vector getFormatStrings(int NumReplacements) { + vector Components; + for (int I = 0; I < NumReplacements; I++) + Components.push_back("{" + to_string(I) + "}"); + // Intersperse these with some other literal text (_). 
+ const string_view Literal = "____"; + for (char C : Literal) + Components.push_back(string(1, C)); + + vector Formats; + do { + string Concat; + for (const string &C : Components) + Concat += C; + Formats.emplace_back(Concat); + } while (next_permutation(Components.begin(), Components.end())); + return Formats; +} + +// Generate the set of formats to exercise outside the benchmark code. +static const vector> Formats = { + getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), + getFormatStrings(4), getFormatStrings(5), +}; + +// Benchmark formatv() for a variety of format strings and 1-5 replacements. +static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 595f2cf559a428..f31ad70021579e 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -67,23 +67,20 @@ class formatv_object_base { protected: StringRef Fmt; ArrayRef Adapters; - - static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad); - - static std::pair - splitLiteralAndReplacement(StringRef Fmt); + bool Validate; formatv_object_base(StringRef Fmt, - ArrayRef Adapters) - : Fmt(Fmt), Adapters(Adapters) {} + ArrayRef Adapters, + bool Validate) + : Fmt(Fmt), Adapters(Adapters), Validate(Validate) {} formatv_object_base(formatv_object_base const &rhs) = delete; formatv_object_base(formatv_object_base &&rhs) = default; public: void 
format(raw_ostream &S) const { - for (auto &R : parseFormatString(Fmt)) { + const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); + for (const auto &R : Replacements) { if (R.Type == ReplacementType::Empty) continue; if (R.Type == ReplacementType::Literal) { @@ -101,9 +98,10 @@ class formatv_object_base { Align.format(S, R.Options); } } - static SmallVector parseFormatString(StringRef Fmt); - static std::optional parseReplacementItem(StringRef Spec); + // Parse and optionally validate format string (in debug builds). + static SmallVector + parseFormatString(StringRef Fmt, size_t NumArgs, bool Validate); std::string str() const { std::string Result; @@ -149,8 +147,8 @@ template class formatv_object : public formatv_object_base { }; public: - formatv_object(StringRef Fmt, Tuple &&Params) - : formatv_object_base(Fmt, ParameterPointers), + formatv_object(StringRef Fmt, Tuple &&Params, bool Validate) + : formatv_object_base(Fmt, ParameterPointers, Validate), Parameters(std::move(Params)) { ParameterPointers = std::apply(create_adapters(), Parameters); } @@ -247,15 +245,22 @@ template class formatv_object : public formatv_object_base { // assertion. Otherwise, it will try to do something reasonable, but in general // the details of what that is are undefined. // + +// formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(const char *Fmt, Ts &&...Vals) +inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) -> formatv_object(Vals))...))> { using ParamTuple = decltype(std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...)); - return formatv_object( - Fmt, std::make_tuple(support::detail::build_format_adapter( - std::forward(Vals))...)); + auto Params = std::make_tuple( + support::detail::build_format_adapter(std::forward(Vals))...); + return formatv_object(Fmt, std::move(Params), Validate); +} + +// formatv() with validation enabled. 
+template inline auto formatv(const char *Fmt, Ts &&...Vals) { + return formatv(true, Fmt, std::forward(Vals)...); } } // end namespace llvm diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index e25d036cdf1e8c..26d2b549136e43 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -25,8 +25,8 @@ static std::optional translateLocChar(char C) { LLVM_BUILTIN_UNREACHABLE; } -bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { +static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, + size_t &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -35,8 +35,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, if (Spec.size() > 1) { // A maximum of 2 characters at the beginning can be used for something - // other - // than the width. + // other than the width. // If Spec[1] is a loc char, then Spec[0] is a pad char and Spec[2:...] // contains the width. // Otherwise, if Spec[0] is a loc char, then Spec[1:...] contains the width. 
@@ -55,8 +54,7 @@ bool formatv_object_base::consumeFieldLayout(StringRef &Spec, AlignStyle &Where, return !Failed; } -std::optional -formatv_object_base::parseReplacementItem(StringRef Spec) { +static std::optional parseReplacementItem(StringRef Spec) { StringRef RepString = Spec.trim("{}"); // If the replacement sequence does not start with a non-negative integer, @@ -82,15 +80,14 @@ formatv_object_base::parseReplacementItem(StringRef Spec) { RepString = StringRef(); } RepString = RepString.trim(); - if (!RepString.empty()) { - assert(false && "Unexpected characters found in replacement string!"); - } + assert(RepString.empty() && + "Unexpected characters found in replacement string!"); return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; } -std::pair -formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { +static std::pair +splitLiteralAndReplacement(StringRef Fmt) { while (!Fmt.empty()) { // Everything up until the first brace is a literal. if (Fmt.front() != '{') { @@ -143,15 +140,77 @@ formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) { return std::make_pair(ReplacementItem{Fmt}, StringRef()); } +#ifndef NDEBUG +#define ENABLE_VALIDATION 1 +#else +#define ENABLE_VALIDATION 0 // Conveniently enable validation in release mode. 
+#endif + SmallVector -formatv_object_base::parseFormatString(StringRef Fmt) { +formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, + bool Validate) { SmallVector Replacements; - ReplacementItem I; + +#if ENABLE_VALIDATION + const StringRef SavedFmtStr = Fmt; + size_t NumExpectedArgs = 0; +#endif + while (!Fmt.empty()) { + ReplacementItem I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); if (I.Type != ReplacementType::Empty) Replacements.push_back(I); +#if ENABLE_VALIDATION + if (I.Type == ReplacementType::Format) + NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); +#endif + } + +#if ENABLE_VALIDATION + if (!Validate) + return Replacements; + + // Perform additional validation. Verify that the number of arguments matches + // the number of replacement indices and that there are no holes in the + // replacement indices. + + // When validation fails, return an array of replacement items that + // will print an error message as the outout of this formatv() (used when + // validation is enabled in release mode). + auto getErrorReplacements = [SavedFmtStr](StringLiteral ErrorMsg) { + return SmallVector{ + ReplacementItem("Invalid formatv() call: "), ReplacementItem(ErrorMsg), + ReplacementItem(" for format string: "), ReplacementItem(SavedFmtStr)}; + }; + + if (NumExpectedArgs != NumArgs) { + errs() << formatv( + "Expected {0} Args, but got {1} for format string '{2}'\n", + NumExpectedArgs, NumArgs, SavedFmtStr); + assert(0 && "Invalid formatv() call"); + return getErrorReplacements("Unexpected number of arguments"); + } + + // Find the number of unique indices seen. All replacement indices + // are < NumExpectedArgs. 
+ SmallVector Indices(NumExpectedArgs); + size_t Count = 0; + for (const ReplacementItem &I : Replacements) { + if (I.Type != ReplacementType::Format || Indices[I.Index]) + continue; + Indices[I.Index] = true; + ++Count; + } + + if (Count != NumExpectedArgs) { + errs() << formatv( + "Replacement field indices cannot have holes for format string '{0}'\n", + SavedFmtStr); + assert(0 && "Invalid format string"); + return getErrorReplacements("Replacement indices have holes"); } +#endif // ENABLE_VALIDATION return Replacements; } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index a78b25c53d7e43..6ee0d924867419 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -9,9 +9,11 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatAdapters.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; +using ::testing::HasSubstr; // Compile-time tests templates in the detail namespace. namespace { @@ -35,14 +37,19 @@ struct NoFormat {}; static_assert(uses_missing_provider::value, ""); } +// Helper to parse format string with no validation. 
+static SmallVector parseFormatString(StringRef Fmt) { + return formatv_object_base::parseFormatString(Fmt, 0, false); +} + TEST(FormatVariadicTest, EmptyFormatString) { - auto Replacements = formatv_object_base::parseFormatString(""); + auto Replacements = parseFormatString(""); EXPECT_EQ(0U, Replacements.size()); } TEST(FormatVariadicTest, NoReplacements) { const StringRef kFormatString = "This is a test"; - auto Replacements = formatv_object_base::parseFormatString(kFormatString); + auto Replacements = parseFormatString(kFormatString); ASSERT_EQ(1U, Replacements.size()); EXPECT_EQ(kFormatString, Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -50,25 +57,25 @@ TEST(FormatVariadicTest, NoReplacements) { TEST(FormatVariadicTest, EscapedBrace) { // {{ should be replaced with { - auto Replacements = formatv_object_base::parseFormatString("{{"); + auto Replacements = parseFormatString("{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // An even number N of braces should be replaced with N/2 braces. - Replacements = formatv_object_base::parseFormatString("{{{{{{"); + Replacements = parseFormatString("{{{{{{"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("{{{", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. - Replacements = formatv_object_base::parseFormatString("}"); + Replacements = parseFormatString("}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); // } does not require doubling up. 
- Replacements = formatv_object_base::parseFormatString("}}}"); + Replacements = parseFormatString("}}}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("}}}", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Literal, Replacements[0].Type); @@ -76,14 +83,14 @@ TEST(FormatVariadicTest, EscapedBrace) { TEST(FormatVariadicTest, ValidReplacementSequence) { // 1. Simple replacement - parameter index only - auto Replacements = formatv_object_base::parseFormatString("{0}"); + auto Replacements = parseFormatString("{0}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(0u, Replacements[0].Align); EXPECT_EQ("", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{1}"); + Replacements = parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); @@ -92,7 +99,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 2. Parameter index with right alignment - Replacements = formatv_object_base::parseFormatString("{0,3}"); + Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -101,7 +108,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 3. And left alignment - Replacements = formatv_object_base::parseFormatString("{0,-3}"); + Replacements = parseFormatString("{0,-3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -110,7 +117,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. 
And center alignment - Replacements = formatv_object_base::parseFormatString("{0,=3}"); + Replacements = parseFormatString("{0,=3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -119,7 +126,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("", Replacements[0].Options); // 4. Parameter index with option string - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -128,7 +135,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 5. Parameter index with alignment before option string - Replacements = formatv_object_base::parseFormatString("{0,-3:foo}"); + Replacements = parseFormatString("{0,-3:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -137,7 +144,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // 7. Parameter indices, options, and alignment can all have whitespace. - Replacements = formatv_object_base::parseFormatString("{ 0, -3 : foo }"); + Replacements = parseFormatString("{ 0, -3 : foo }"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -147,7 +154,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. 
- Replacements = formatv_object_base::parseFormatString("{0:0:1}"); + Replacements = parseFormatString("{0:0:1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -157,7 +164,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:1", Replacements[0].Options); // 9. Custom padding character - Replacements = formatv_object_base::parseFormatString("{0,p+4:foo}"); + Replacements = parseFormatString("{0,p+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -168,7 +175,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("foo", Replacements[0].Options); // Format string special characters are allowed as padding character - Replacements = formatv_object_base::parseFormatString("{0,-+4:foo}"); + Replacements = parseFormatString("{0,-+4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -178,7 +185,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,+-4:foo}"); + Replacements = parseFormatString("{0,+-4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -188,7 +195,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,==4:foo}"); + Replacements = parseFormatString("{0,==4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -198,7 +205,7 @@ TEST(FormatVariadicTest, 
ValidReplacementSequence) { EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); - Replacements = formatv_object_base::parseFormatString("{0,:=4:foo}"); + Replacements = parseFormatString("{0,:=4:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); @@ -211,7 +218,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { TEST(FormatVariadicTest, DefaultReplacementValues) { // 2. If options string is missing, it defaults to empty. - auto Replacements = formatv_object_base::parseFormatString("{0,3}"); + auto Replacements = parseFormatString("{0,3}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -219,7 +226,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. - Replacements = formatv_object_base::parseFormatString("{0,3:}"); + Replacements = parseFormatString("{0,3:}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); @@ -227,7 +234,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space - Replacements = formatv_object_base::parseFormatString("{0:foo}"); + Replacements = parseFormatString("{0:foo}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); @@ -238,8 +245,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { } TEST(FormatVariadicTest, MultipleReplacements) { - auto Replacements = - formatv_object_base::parseFormatString("{0} {1:foo}-{2,-3:bar}"); + auto Replacements = parseFormatString("{0} {1:foo}-{2,-3:bar}"); ASSERT_EQ(5u, Replacements.size()); // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 82f8718fc556ad..7016fe41ca75d0 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1654,12 +1654,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, dir->shouldBeQualified() ? qualifiedTypeParserCode : typeParserCode; TypeSwitch(dir->getArg()) .Case([&](auto operand) { - body << formatv(parserCode, + body << formatv(false, parserCode, operand->getVar()->constraint.getCppType(), listName); }) .Default([&](auto operand) { - body << formatv(parserCode, "::mlir::Type", listName); + body << formatv(false, parserCode, "::mlir::Type", listName); }); } } else if (auto *dir = dyn_cast(element)) { From 9a58b12fe7bf54c9433ec89bae2a2d6cfe489e75 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 29 Aug 2024 18:41:15 +0100 Subject: [PATCH 29/72] [ExtendLifetimes][NFC] Add explicit triple to new fake-use tests Several tests for the new fake use intrinsic are failing on NVPTX buildbots due to relying on behaviour for their expected triple; this commit adds that triple to each of them to prevent failures. Fixes commit 3d08ade (#86149). 
Example buildbot failures: https://lab.llvm.org/buildbot/#/builders/160/builds/4175 https://lab.llvm.org/buildbot/#/builders/180/builds/4173 --- llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir | 2 +- llvm/test/CodeGen/X86/fake-use-scheduler.mir | 2 +- llvm/test/CodeGen/X86/fake-use-tailcall.ll | 2 +- llvm/test/CodeGen/X86/fake-use-vector2.ll | 2 +- llvm/test/CodeGen/X86/fake-use-zero-length.ll | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir index 7eb8915f26a80f..6c2cb0e55222b2 100644 --- a/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir +++ b/llvm/test/CodeGen/MIR/X86/fake-use-tailcall.mir @@ -22,7 +22,7 @@ # return temp; # } # -# RUN: llc -run-pass=codegenprepare -o - %s | FileCheck %s +# RUN: llc -run-pass=codegenprepare -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s # # CHECK: define{{.*}}foo # CHECK: if.then: diff --git a/llvm/test/CodeGen/X86/fake-use-scheduler.mir b/llvm/test/CodeGen/X86/fake-use-scheduler.mir index 7e55f1d79aa7b6..8b82c4ed2485dc 100644 --- a/llvm/test/CodeGen/X86/fake-use-scheduler.mir +++ b/llvm/test/CodeGen/X86/fake-use-scheduler.mir @@ -1,5 +1,5 @@ # Prevent the machine scheduler from moving instructions past FAKE_USE. 
-# RUN: llc -run-pass machine-scheduler -debug-only=machine-scheduler 2>&1 -o - %s | FileCheck %s +# RUN: llc -run-pass machine-scheduler -mtriple=x86_64-unknown-linux -debug-only=machine-scheduler 2>&1 -o - %s | FileCheck %s # REQUIRES: asserts # # We make sure that, beginning with the first FAKE_USE instruction, diff --git a/llvm/test/CodeGen/X86/fake-use-tailcall.ll b/llvm/test/CodeGen/X86/fake-use-tailcall.ll index 10bb22e1b564ab..67c28dcf1301c4 100644 --- a/llvm/test/CodeGen/X86/fake-use-tailcall.ll +++ b/llvm/test/CodeGen/X86/fake-use-tailcall.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -stop-after=finalize-isel - | FileCheck %s --implicit-check-not FAKE_USE +; RUN: llc < %s -stop-after=finalize-isel -mtriple=x86_64-unknown-linux - | FileCheck %s --implicit-check-not FAKE_USE ; Fake uses following tail calls should be pulled in front ; of the TCRETURN instruction. Fake uses using something defined by ; the tail call or after it should be suppressed. diff --git a/llvm/test/CodeGen/X86/fake-use-vector2.ll b/llvm/test/CodeGen/X86/fake-use-vector2.ll index 6f2d3a5566dc67..190197615775a9 100644 --- a/llvm/test/CodeGen/X86/fake-use-vector2.ll +++ b/llvm/test/CodeGen/X86/fake-use-vector2.ll @@ -1,4 +1,4 @@ -; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -filetype=asm -o - %s | FileCheck %s ; ; Make sure we can split vectors that are used as operands of FAKE_USE. 
diff --git a/llvm/test/CodeGen/X86/fake-use-zero-length.ll b/llvm/test/CodeGen/X86/fake-use-zero-length.ll index e8c6791b8edff2..e3bdd2659dd913 100644 --- a/llvm/test/CodeGen/X86/fake-use-zero-length.ll +++ b/llvm/test/CodeGen/X86/fake-use-zero-length.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -stop-after=finalize-isel | FileCheck %s --implicit-check-not=FAKE_USE +; RUN: llc < %s -stop-after=finalize-isel -mtriple=x86_64-unknown-linux | FileCheck %s --implicit-check-not=FAKE_USE ; ; Make sure SelectionDAG does not crash handling fake uses of zero-length arrays ; and structs. Check also that they are not propagated. From 4bc7c74240b6f13bf421c1fef0155370b23d9fc8 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 11:00:31 -0700 Subject: [PATCH 30/72] [SLP] Extract isIdentityOrder to common routine [probably NFC] (#106582) This isn't quite just code motion as the four different versions we had of this routine differed in whether they ignored the "size" marker used to represent undef. I doubt this matters in practice, but it is a functional change. --------- Co-authored-by: Alexey Bataev --- .../Transforms/Vectorize/SLPVectorizer.cpp | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 81811e0a4d9295..e77db3cbd81fe5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1364,6 +1364,18 @@ class BoUpSLP { /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); + /// Does this non-empty order represent an identity order? Identity + /// should be represented as an empty order, so this is used to + /// decide if we can canonicalize a computed order. Undef elements + /// (represented as size) are ignored. 
+ bool isIdentityOrder(ArrayRef Order) const { + assert(!Order.empty() && "expected non-empty order"); + const unsigned Sz = Order.size(); + return all_of(enumerate(Order), [&](const auto &P) { + return P.value() == P.index() || P.value() == Sz; + }); + } + /// Checks if the specified gather tree entry \p TE can be represented as a /// shuffled vector entry + (possibly) permutation with other gathers. It /// implements the checks only for possibly ordered scalars (Loads, @@ -5256,12 +5268,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } return I1 < I2; }; - auto IsIdentityOrder = [](const OrdersType &Order) { - for (unsigned Idx : seq(0, Order.size())) - if (Idx != Order[Idx]) - return false; - return true; - }; DenseMap PhiToId; SmallVector Phis(TE.Scalars.size()); std::iota(Phis.begin(), Phis.end(), 0); @@ -5271,7 +5277,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { stable_sort(Phis, PHICompare); for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) ResOrder[Id] = PhiToId[Phis[Id]]; - if (IsIdentityOrder(ResOrder)) + if (isIdentityOrder(ResOrder)) return std::nullopt; // No need to reorder. return std::move(ResOrder); } @@ -5565,19 +5571,12 @@ void BoUpSLP::reorderTopToBottom() { } if (OrdersUses.empty()) continue; - auto IsIdentityOrder = [](ArrayRef Order) { - const unsigned Sz = Order.size(); - for (unsigned Idx : seq(0, Sz)) - if (Idx != Order[Idx] && Order[Idx] != Sz) - return false; - return true; - }; // Choose the most used order. 
unsigned IdentityCnt = 0; unsigned FilledIdentityCnt = 0; OrdersType IdentityOrder(VF, VF); for (auto &Pair : OrdersUses) { - if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (Pair.first.empty() || isIdentityOrder(Pair.first)) { if (!Pair.first.empty()) FilledIdentityCnt += Pair.second; IdentityCnt += Pair.second; @@ -5593,7 +5592,7 @@ void BoUpSLP::reorderTopToBottom() { if (Cnt < Pair.second || (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && Cnt == Pair.second && !BestOrder.empty() && - IsIdentityOrder(BestOrder))) { + isIdentityOrder(BestOrder))) { combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; @@ -5602,7 +5601,7 @@ void BoUpSLP::reorderTopToBottom() { } } // Set order of the user node. - if (IsIdentityOrder(BestOrder)) + if (isIdentityOrder(BestOrder)) continue; fixupOrderingIndices(BestOrder); SmallVector Mask; @@ -5891,19 +5890,12 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { OrderedEntries.remove(Op.second); continue; } - auto IsIdentityOrder = [](ArrayRef Order) { - const unsigned Sz = Order.size(); - for (unsigned Idx : seq(0, Sz)) - if (Idx != Order[Idx] && Order[Idx] != Sz) - return false; - return true; - }; // Choose the most used order. unsigned IdentityCnt = 0; unsigned VF = Data.second.front().second->getVectorFactor(); OrdersType IdentityOrder(VF, VF); for (auto &Pair : OrdersUses) { - if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (Pair.first.empty() || isIdentityOrder(Pair.first)) { IdentityCnt += Pair.second; combineOrders(IdentityOrder, Pair.first); } @@ -5923,7 +5915,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } } // Set order of the user node. 
- if (IsIdentityOrder(BestOrder)) { + if (isIdentityOrder(BestOrder)) { for (const std::pair &Op : Data.second) OrderedEntries.remove(Op.second); continue; @@ -6186,13 +6178,7 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in // reorderTopToBottom() and reorderBottomToTop(), so we are following the // same convention here. - auto IsIdentityOrder = [](const OrdersType &Order) { - for (unsigned Idx : seq(0, Order.size())) - if (Idx != Order[Idx]) - return false; - return true; - }; - if (IsIdentityOrder(ReorderIndices)) + if (isIdentityOrder(ReorderIndices)) ReorderIndices.clear(); return true; From fd0dbc7f4d8a5900535aa87569fbc385b7c50ba6 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Thu, 29 Aug 2024 11:02:43 -0700 Subject: [PATCH 31/72] [DirectX] add enum for PSV resource type/kind/flag. (#106227) Add ResourceType, ResourceKind and ResourceFlag enum class for PSV resource. This is for #103275 --- llvm/include/llvm/BinaryFormat/DXContainer.h | 25 +++++++++- .../BinaryFormat/DXContainerConstants.def | 46 +++++++++++++++++++ .../include/llvm/ObjectYAML/DXContainerYAML.h | 3 ++ llvm/lib/BinaryFormat/DXContainer.cpp | 30 ++++++++++++ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 18 ++++++++ .../DXContainer/DomainMaskVectors.yaml | 4 +- .../DXContainer/PSVv0-amplification.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-compute.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-domain.yaml | 8 ++-- .../DXContainer/PSVv0-geometry.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-hull.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-mesh.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-pixel.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv0-vertex.yaml | 8 ++-- .../DXContainer/PSVv1-amplification.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-compute.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-domain.yaml | 8 ++-- .../DXContainer/PSVv1-geometry.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-hull.yaml | 8 ++-- 
.../ObjectYAML/DXContainer/PSVv1-mesh.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-pixel.yaml | 8 ++-- .../ObjectYAML/DXContainer/PSVv1-vertex.yaml | 8 ++-- .../DXContainer/PSVv2-amplification.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-compute.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-domain.yaml | 24 +++++----- .../DXContainer/PSVv2-geometry.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-hull.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-mesh.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-pixel.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv2-vertex.yaml | 24 +++++----- .../DXContainer/PSVv3-amplification.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-compute.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-domain.yaml | 24 +++++----- .../DXContainer/PSVv3-geometry.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-hull.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-mesh.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-pixel.yaml | 24 +++++----- .../ObjectYAML/DXContainer/PSVv3-vertex.yaml | 24 +++++----- .../ObjectYAML/DXContainer/SigElements.yaml | 4 +- llvm/unittests/Object/DXContainerTest.cpp | 40 ++++++++-------- 40 files changed, 400 insertions(+), 282 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 013431faff2728..a4cc814549c95b 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -299,6 +299,27 @@ enum class InterpolationMode : uint8_t { ArrayRef> getInterpolationModes(); +#define RESOURCE_TYPE(Val, Enum) Enum = Val, +enum class ResourceType : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getResourceTypes(); + +#define RESOURCE_KIND(Val, Enum) Enum = Val, +enum class ResourceKind : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getResourceKinds(); + +#define RESOURCE_FLAG(Val, Enum) Enum = Val, +enum class 
ResourceFlag : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getResourceFlags(); + namespace v0 { struct RuntimeInfo { PipelinePSVInfo StageInfo; @@ -315,7 +336,7 @@ struct RuntimeInfo { }; struct ResourceBindInfo { - uint32_t Type; + ResourceType Type; uint32_t Space; uint32_t LowerBound; uint32_t UpperBound; @@ -417,7 +438,7 @@ struct RuntimeInfo : public v1::RuntimeInfo { }; struct ResourceBindInfo : public v0::ResourceBindInfo { - uint32_t Kind; + ResourceKind Kind; uint32_t Flags; void swapBytes() { diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 62dc573555198b..4111cecb018bb3 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -150,6 +150,52 @@ INTERPOLATION_MODE(8, Invalid) #undef INTERPOLATION_MODE #endif // INTERPOLATION_MODE +#ifdef RESOURCE_TYPE +RESOURCE_TYPE(0, Invalid) +RESOURCE_TYPE(1, Sampler) +RESOURCE_TYPE(2, CBV) +RESOURCE_TYPE(3, SRVTyped) +RESOURCE_TYPE(4, SRVRaw) +RESOURCE_TYPE(5, SRVStructured) +RESOURCE_TYPE(6, UAVTyped) +RESOURCE_TYPE(7, UAVRaw) +RESOURCE_TYPE(8, UAVStructured) +RESOURCE_TYPE(9, UAVStructuredWithCounter) + +#undef RESOURCE_TYPE +#endif // RESOURCE_TYPE + +#ifdef RESOURCE_KIND +RESOURCE_KIND(0, Invalid) +RESOURCE_KIND(1, Texture1D) +RESOURCE_KIND(2, Texture2D) +RESOURCE_KIND(3, Texture2DMS) +RESOURCE_KIND(4, Texture3D) +RESOURCE_KIND(5, TextureCube) +RESOURCE_KIND(6, Texture1DArray) +RESOURCE_KIND(7, Texture2DArray) +RESOURCE_KIND(8, Texture2DMSArray) +RESOURCE_KIND(9, TextureCubeArray) +RESOURCE_KIND(10, TypedBuffer) +RESOURCE_KIND(11, RawBuffer) +RESOURCE_KIND(12, StructuredBuffer) +RESOURCE_KIND(13, CBuffer) +RESOURCE_KIND(14, Sampler) +RESOURCE_KIND(15, TBuffer) +RESOURCE_KIND(16, RTAccelerationStructure) +RESOURCE_KIND(17, FeedbackTexture2D) +RESOURCE_KIND(18, FeedbackTexture2DArray) + +#undef RESOURCE_KIND +#endif // RESOURCE_KIND + 
+#ifdef RESOURCE_FLAG +RESOURCE_FLAG(0, None) +RESOURCE_FLAG(1, UsedByAtomic64) + +#undef RESOURCE_FLAG +#endif // RESOURCE_FLAG + #ifdef D3D_SYSTEM_VALUE D3D_SYSTEM_VALUE(0, Undefined) diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 9c4d9e19f11ba3..e432359b7bbd07 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -176,6 +176,9 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureParameter) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::SemanticKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceType) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceFlag) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 9c0e657b069697..790947cc729c0b 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -89,3 +89,33 @@ static const EnumEntry InterpolationModeNames[] = { ArrayRef> PSV::getInterpolationModes() { return ArrayRef(InterpolationModeNames); } + +#define RESOURCE_TYPE(Val, Enum) {#Enum, PSV::ResourceType::Enum}, + +static const EnumEntry ResourceTypeNames[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> PSV::getResourceTypes() { + return ArrayRef(ResourceTypeNames); +} + +#define RESOURCE_KIND(Val, Enum) {#Enum, PSV::ResourceKind::Enum}, + +static const EnumEntry ResourceKindNames[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> PSV::getResourceKinds() { + return ArrayRef(ResourceKindNames); +} + +#define 
RESOURCE_FLAG(Val, Enum) {#Enum, PSV::ResourceFlag::Enum}, + +static const EnumEntry ResourceFlagNames[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> PSV::getResourceFlags() { + return ArrayRef(ResourceFlagNames); +} diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 38063670aee6e8..21a966d5abd132 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -254,6 +254,24 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::PSV::ResourceType &Value) { + for (const auto &E : dxbc::PSV::getResourceTypes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::PSV::ResourceKind &Value) { + for (const auto &E : dxbc::PSV::getResourceKinds()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::PSV::ResourceFlag &Value) { + for (const auto &E : dxbc::PSV::getResourceFlags()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + void ScalarEnumerationTraits::enumeration( IO &IO, dxbc::D3DSystemValue &Value) { for (const auto &E : dxbc::getD3DSystemValues()) diff --git a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml index 713fbc61e094b5..f3cfa90d1cf901 100644 --- a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml +++ b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml @@ -70,11 +70,11 @@ Parts: NumThreadsZ: 0 ResourceStride: 24 Resources: - - Type: 2 + - Type: CBV Space: 0 LowerBound: 0 UpperBound: 0 - Kind: 13 + Kind: CBuffer Flags: 0 SigInputElements: - Name: AAA_HSFoo diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml index d15bfadda41f07..3597b684fa032a 
100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-amplification.yaml @@ -19,11 +19,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -48,11 +48,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml index 7e9f2fbd8b54de..4f8e60b780c560 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-compute.yaml @@ -18,11 +18,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -46,11 +46,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml index db2aee954b3466..fb8d148c286343 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml +++ 
b/llvm/test/ObjectYAML/DXContainer/PSVv0-domain.yaml @@ -21,11 +21,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -52,11 +52,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml index 5509ac669e2d6b..cd59c6a10d1a3e 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-geometry.yaml @@ -22,11 +22,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -54,11 +54,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml index cd60f2b192b2e2..a672f9260516a4 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-hull.yaml @@ -22,11 +22,11 @@ Parts: MaximumWaveLaneCount: 4294967295 
ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -54,11 +54,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml index 07fb656c5b72a7..07fee2ff65e5bf 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-mesh.yaml @@ -23,11 +23,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -56,11 +56,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml index c7f956e5740cca..6bf18f340c3399 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-pixel.yaml @@ -20,11 +20,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: 
Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -50,11 +50,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml index 6df9169b73e2f5..e0690fba0e8c4c 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv0-vertex.yaml @@ -19,11 +19,11 @@ Parts: MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -48,11 +48,11 @@ Parts: # CHECK-NEXT: MaximumWaveLaneCount: 4294967295 # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml index 982235549cddc6..beb85ee9828207 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-amplification.yaml @@ -22,11 +22,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -62,11 +62,11 
@@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml index 629d45c65a2081..6c90fbb206c6af 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-compute.yaml @@ -21,11 +21,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -60,11 +60,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml index 941ec16544a2df..28a4884d1228f3 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-domain.yaml @@ -25,11 +25,11 @@ Parts: SigOutputVectors: [ 0, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -69,11 +69,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 0, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # 
CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml index a666cc4464d457..2884fd75e73d56 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-geometry.yaml @@ -26,11 +26,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -70,11 +70,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml index c0f0f41e2318bd..0e71276ad1c16f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-hull.yaml @@ -26,11 +26,11 @@ Parts: SigOutputVectors: [ 0, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -71,11 +71,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 0, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 
# CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml index f981cb99cb968d..1af47f95c5e723 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-mesh.yaml @@ -28,11 +28,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -74,11 +74,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml index a7b6804fe5fe6c..156e83f655e4f1 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-pixel.yaml @@ -23,11 +23,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -64,11 +64,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid 
# CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml index a9590ba7704040..020e2345c6eec2 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv1-vertex.yaml @@ -22,11 +22,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 @@ -62,11 +62,11 @@ Parts: # CHECK-NEXT: SigOutputVectors: [ 8, 16, 32, 64 ] # CHECK-NEXT: ResourceStride: 16 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml index c1ad560f463e1e..8bae742b573919 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml @@ -25,18 +25,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -72,18 +72,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: 
UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml index dc0ac3af9aa34b..74eb2b86ad01b2 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml @@ -24,18 +24,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -70,18 +70,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml 
b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml index 03e23b06d05e47..38f81bd93d67cf 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml @@ -28,18 +28,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,18 +79,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml index b4a5efd2276c52..99fdbbb7c9edaf 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml @@ -29,18 +29,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: 
[] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -80,18 +80,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml index a1c87343ee915b..de8af95dbcbd89 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml @@ -29,18 +29,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -81,18 +81,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 
65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml index 6155a3e2354bce..78fc077348f42a 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml @@ -31,18 +31,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -84,18 +84,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml index 3fdd7be8fe7de9..ebe1e51faff3f8 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml @@ -26,18 +26,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 
LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -74,18 +74,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml index eb77fb1b10d77a..2bca2f211136b2 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml @@ -25,18 +25,18 @@ Parts: NumThreadsZ: 2048 ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -72,18 +72,18 @@ Parts: # CHECK-NEXT: NumThreadsZ: 2048 # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# 
CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml index 09885bd529f05f..9e31d40ec7c1b4 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml @@ -26,18 +26,18 @@ Parts: EntryName: ASEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -74,18 +74,18 @@ Parts: # CHECK-NEXT: EntryName: ASEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml 
b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml index ee6fb112c77222..530a8597cb6498 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml @@ -25,18 +25,18 @@ Parts: EntryName: CSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -72,18 +72,18 @@ Parts: # CHECK-NEXT: EntryName: CSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml index dd367deae88e47..a71ab67633eb6f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml @@ -29,18 +29,18 @@ Parts: EntryName: DSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] 
SigOutputElements: [] SigPatchOrPrimElements: [] @@ -81,18 +81,18 @@ Parts: # CHECK-NEXT: EntryName: DSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml index 4c7680b63b02b6..db530253c6a745 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml @@ -30,18 +30,18 @@ Parts: EntryName: GSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -82,18 +82,18 @@ Parts: # CHECK-NEXT: EntryName: GSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# 
CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml index 3bbad8a9b0ee62..3e3ba493e98450 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml @@ -30,18 +30,18 @@ Parts: EntryName: HSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -83,18 +83,18 @@ Parts: # CHECK-NEXT: EntryName: HSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml index c5ea1fcf078087..57bbcecfa1796b 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml @@ -32,18 +32,18 @@ Parts: EntryName: MSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: 
Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -86,18 +86,18 @@ Parts: # CHECK-NEXT: EntryName: MSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml index b28d5ec8074d85..c94c234142a34b 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml @@ -27,18 +27,18 @@ Parts: EntryName: PSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -76,18 +76,18 @@ Parts: # CHECK-NEXT: EntryName: PSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# 
CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml index d1fb55839931ca..697fa870c2257c 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml @@ -26,18 +26,18 @@ Parts: EntryName: VSEntry ResourceStride: 24 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - Kind: 5 - Flags: 6 - - Type: 128 + Kind: TextureCube + Flags: 0 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 - Kind: 65535 - Flags: 16776960 + Kind: Invalid + Flags: 0 SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -74,18 +74,18 @@ Parts: # CHECK-NEXT: EntryName: VSEntry # CHECK-NEXT: ResourceStride: 24 # CHECK-NEXT: Resources: -# CHECK-NEXT: - Type: 1 +# CHECK-NEXT: - Type: Sampler # CHECK-NEXT: Space: 2 # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 -# CHECK-NEXT: Kind: 5 -# CHECK-NEXT: Flags: 6 -# CHECK-NEXT: - Type: 128 +# CHECK-NEXT: Kind: TextureCube +# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 -# CHECK-NEXT: Kind: 65535 -# CHECK-NEXT: Flags: 16776960 +# CHECK-NEXT: Kind: Invalid +# CHECK-NEXT: Flags: 0 # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/SigElements.yaml 
b/llvm/test/ObjectYAML/DXContainer/SigElements.yaml index 47a18a6487c975..e7c72761901ebc 100644 --- a/llvm/test/ObjectYAML/DXContainer/SigElements.yaml +++ b/llvm/test/ObjectYAML/DXContainer/SigElements.yaml @@ -23,11 +23,11 @@ Parts: SigOutputVectors: [ 8, 16, 32, 64 ] ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 2 LowerBound: 3 UpperBound: 4 - - Type: 128 + - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 9da6543c520c74..5a2c852d6aef97 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -266,15 +266,15 @@ TEST(DXCFile, PSVResourceIterators) { MaximumWaveLaneCount: 4294967295 ResourceStride: 16 Resources: - - Type: 1 + - Type: Sampler Space: 1 LowerBound: 1 UpperBound: 1 - - Type: 2 + - Type: CBV Space: 2 LowerBound: 2 UpperBound: 2 - - Type: 3 + - Type: SRVTyped Space: 3 LowerBound: 3 UpperBound: 3 @@ -308,13 +308,13 @@ TEST(DXCFile, PSVResourceIterators) { dxbc::PSV::v2::ResourceBindInfo Binding; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Flags, 0u); ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 2u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::CBV); EXPECT_EQ(Binding.Flags, 0u); --It; @@ -322,25 +322,25 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(It == PSVInfo->getResources().begin()); - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Flags, 0u); --It; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Flags, 0u); ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 2u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::CBV); EXPECT_EQ(Binding.Flags, 0u); ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 3u); + EXPECT_EQ(Binding.Type, 
dxbc::PSV::ResourceType::SRVTyped); EXPECT_EQ(Binding.Flags, 0u); EXPECT_FALSE(It == PSVInfo->getResources().end()); @@ -351,7 +351,7 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(It == PSVInfo->getResources().end()); EXPECT_FALSE(It != PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); { @@ -361,7 +361,7 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(Old == PSVInfo->getResources().end()); EXPECT_FALSE(Old != PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); } @@ -369,7 +369,7 @@ TEST(DXCFile, PSVResourceIterators) { EXPECT_TRUE(It == PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); { @@ -377,13 +377,13 @@ TEST(DXCFile, PSVResourceIterators) { Binding = *Old; EXPECT_TRUE(Old == PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); } Binding = *It; - EXPECT_EQ(Binding.Type, 3u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::SRVTyped); EXPECT_EQ(Binding.Flags, 0u); } @@ -587,7 +587,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { dxbc::PSV::v2::ResourceBindInfo Binding; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Space, 2u); EXPECT_EQ(Binding.LowerBound, 3u); EXPECT_EQ(Binding.UpperBound, 4u); @@ -595,7 +595,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 5u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::SRVStructured); EXPECT_EQ(Binding.Space, 6u); EXPECT_EQ(Binding.LowerBound, 7u); EXPECT_EQ(Binding.UpperBound, 8u); @@ -605,7 +605,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { EXPECT_TRUE(It == 
PSVInfo->getResources().begin()); - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Space, 2u); EXPECT_EQ(Binding.LowerBound, 3u); EXPECT_EQ(Binding.UpperBound, 4u); @@ -613,7 +613,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { --It; Binding = *It; - EXPECT_EQ(Binding.Type, 1u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Sampler); EXPECT_EQ(Binding.Space, 2u); EXPECT_EQ(Binding.LowerBound, 3u); EXPECT_EQ(Binding.UpperBound, 4u); @@ -621,7 +621,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { ++It; Binding = *It; - EXPECT_EQ(Binding.Type, 5u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::SRVStructured); EXPECT_EQ(Binding.Space, 6u); EXPECT_EQ(Binding.LowerBound, 7u); EXPECT_EQ(Binding.UpperBound, 8u);; @@ -635,7 +635,7 @@ TEST(DXCFile, PSVResourceIteratorsStride) { EXPECT_TRUE(It == PSVInfo->getResources().end()); EXPECT_FALSE(It != PSVInfo->getResources().end()); - EXPECT_EQ(Binding.Type, 0u); + EXPECT_EQ(Binding.Type, dxbc::PSV::ResourceType::Invalid); EXPECT_EQ(Binding.Flags, 0u); } From 1ace91f925ad87c3e5eb836ad58fdffe60c4aea6 Mon Sep 17 00:00:00 2001 From: Job Henandez Lara Date: Thu, 29 Aug 2024 11:14:18 -0700 Subject: [PATCH 32/72] [libc][math] Add performance tests for fmul and fmull. 
(#106262) --- .../BinaryOpSingleOutputPerf.h | 50 +++++++++++-------- .../math/performance_testing/CMakeLists.txt | 22 ++++++++ .../math/performance_testing/fmod_perf.cpp | 2 +- .../math/performance_testing/fmodf16_perf.cpp | 4 +- .../math/performance_testing/fmodf_perf.cpp | 2 +- .../math/performance_testing/fmul_perf.cpp | 23 +++++++++ .../math/performance_testing/fmull_perf.cpp | 23 +++++++++ .../math/performance_testing/hypot_perf.cpp | 2 +- .../math/performance_testing/hypotf_perf.cpp | 2 +- .../max_min_funcs_perf.cpp | 32 ++++++------ .../misc_basic_ops_perf.cpp | 6 +-- 11 files changed, 121 insertions(+), 47 deletions(-) create mode 100644 libc/test/src/math/performance_testing/fmul_perf.cpp create mode 100644 libc/test/src/math/performance_testing/fmull_perf.cpp diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h index 63d9768e21899b..98a1813bd7b54a 100644 --- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h +++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h @@ -16,15 +16,15 @@ namespace LIBC_NAMESPACE_DECL { namespace testing { - -template class BinaryOpSingleOutputPerf { - using FPBits = fputil::FPBits; +template +class BinaryOpSingleOutputPerf { + using FPBits = fputil::FPBits; using StorageType = typename FPBits::StorageType; static constexpr StorageType UIntMax = cpp::numeric_limits::max(); public: - typedef T Func(T, T); + typedef OutputType Func(InputType, InputType); static void run_perf_in_range(Func myFunc, Func otherFunc, StorageType startingBit, StorageType endingBit, @@ -33,7 +33,7 @@ template class BinaryOpSingleOutputPerf { N = cpp::min(N, static_cast(endingBit - startingBit)); auto runner = [=](Func func) { - [[maybe_unused]] volatile T result; + [[maybe_unused]] volatile OutputType result; if (endingBit < startingBit) { return; } @@ -42,8 +42,8 @@ template class BinaryOpSingleOutputPerf { for (size_t i = 0; 
i < rounds; i++) { for (StorageType bitsX = startingBit, bitsY = endingBit;; bitsX += step, bitsY -= step) { - T x = FPBits(bitsX).get_val(); - T y = FPBits(bitsY).get_val(); + InputType x = FPBits(bitsX).get_val(); + InputType y = FPBits(bitsY).get_val(); result = func(x, y); if (endingBit - bitsX < step) { break; @@ -94,10 +94,11 @@ template class BinaryOpSingleOutputPerf { 1'000'001, rounds, log); log << "\n Performance tests with inputs in normal range with exponents " "close to each other:\n"; - run_perf_in_range(myFunc, otherFunc, - /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(), - /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), - 1'000'001, rounds, log); + run_perf_in_range( + myFunc, otherFunc, + /* startingBit= */ FPBits(OutputType(0x1.0p-10)).uintval(), + /* endingBit= */ FPBits(OutputType(0x1.0p+10)).uintval(), 1'000'001, + rounds, log); } static void run_diff(Func myFunc, Func otherFunc, const char *logFile) { @@ -115,8 +116,10 @@ template class BinaryOpSingleOutputPerf { log << "\n Diff tests with inputs in normal range with exponents " "close to each other:\n"; diffCount += run_diff_in_range( - myFunc, otherFunc, /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(), - /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 10'000'001, log); + myFunc, otherFunc, + /* startingBit= */ FPBits(OutputType(0x1.0p-10)).uintval(), + /* endingBit= */ FPBits(OutputType(0x1.0p+10)).uintval(), 10'000'001, + log); log << "Total number of differing results: " << diffCount << '\n'; } @@ -125,18 +128,21 @@ template class BinaryOpSingleOutputPerf { } // namespace testing } // namespace LIBC_NAMESPACE_DECL -#define BINARY_OP_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename) \ +#define BINARY_OP_SINGLE_OUTPUT_PERF(OutputType, InputType, myFunc, otherFunc, \ + filename) \ int main() { \ - LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf::run_perf( \ - &myFunc, &otherFunc, 1, filename); \ + LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf< \ + OutputType, 
InputType>::run_perf(&myFunc, &otherFunc, 1, filename); \ return 0; \ } -#define BINARY_OP_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds, \ - filename) \ +#define BINARY_OP_SINGLE_OUTPUT_PERF_EX(OutputType, InputType, myFunc, \ + otherFunc, rounds, filename) \ { \ - LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf::run_perf( \ - &myFunc, &otherFunc, rounds, filename); \ - LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf::run_perf( \ - &myFunc, &otherFunc, rounds, filename); \ + LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf< \ + OutputType, InputType>::run_perf(&myFunc, &otherFunc, rounds, \ + filename); \ + LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf< \ + OutputType, InputType>::run_perf(&myFunc, &otherFunc, rounds, \ + filename); \ } diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt index 8e529ca09ed797..ed1b03f3493c7d 100644 --- a/libc/test/src/math/performance_testing/CMakeLists.txt +++ b/libc/test/src/math/performance_testing/CMakeLists.txt @@ -476,3 +476,25 @@ add_perf_binary( COMPILE_OPTIONS -fno-builtin ) + +add_perf_binary( + fmul_perf + SRCS + fmul_perf.cpp + DEPENDS + .binary_op_single_output_diff + libc.src.math.fmul + COMPILE_OPTIONS + -fno-builtin +) + +add_perf_binary( + fmull_perf + SRCS + fmull_perf.cpp + DEPENDS + .binary_op_single_output_diff + libc.src.math.fmull + COMPILE_OPTIONS + -fno-builtin +) diff --git a/libc/test/src/math/performance_testing/fmod_perf.cpp b/libc/test/src/math/performance_testing/fmod_perf.cpp index fa9b4c6b41287b..75a4242034226b 100644 --- a/libc/test/src/math/performance_testing/fmod_perf.cpp +++ b/libc/test/src/math/performance_testing/fmod_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(double, LIBC_NAMESPACE::fmod, ::fmod, +BINARY_OP_SINGLE_OUTPUT_PERF(double, double, LIBC_NAMESPACE::fmod, ::fmod, "fmod_perf.log") diff --git a/libc/test/src/math/performance_testing/fmodf16_perf.cpp 
b/libc/test/src/math/performance_testing/fmodf16_perf.cpp index ff01fa6ca5870e..062bc2da05adf9 100644 --- a/libc/test/src/math/performance_testing/fmodf16_perf.cpp +++ b/libc/test/src/math/performance_testing/fmodf16_perf.cpp @@ -16,11 +16,11 @@ #define FMOD_FUNC(U) (LIBC_NAMESPACE::fputil::generic::FMod::eval) int main() { - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, FMOD_FUNC(uint16_t), + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint32_t), 5000, "fmodf16_u16_vs_u32_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, FMOD_FUNC(uint16_t), + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint64_t), 5000, "fmodf16_u16_vs_u64_perf.log") return 0; diff --git a/libc/test/src/math/performance_testing/fmodf_perf.cpp b/libc/test/src/math/performance_testing/fmodf_perf.cpp index f13f02e2439da3..b4f37ef25e676f 100644 --- a/libc/test/src/math/performance_testing/fmodf_perf.cpp +++ b/libc/test/src/math/performance_testing/fmodf_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(float, LIBC_NAMESPACE::fmodf, ::fmodf, +BINARY_OP_SINGLE_OUTPUT_PERF(float, float, LIBC_NAMESPACE::fmodf, ::fmodf, "fmodf_perf.log") diff --git a/libc/test/src/math/performance_testing/fmul_perf.cpp b/libc/test/src/math/performance_testing/fmul_perf.cpp new file mode 100644 index 00000000000000..a215405eb6aa5d --- /dev/null +++ b/libc/test/src/math/performance_testing/fmul_perf.cpp @@ -0,0 +1,23 @@ +//===-- Performance test for the fmul function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BinaryOpSingleOutputPerf.h" +#include "src/math/fmul.h" + +static constexpr size_t DOUBLE_ROUNDS = 40; + +float fmul_placeholder_binary(double x, double y) { + return static_cast(x * y); +} + +int main() { + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, double, LIBC_NAMESPACE::fmul, + fmul_placeholder_binary, DOUBLE_ROUNDS, + "fmul_perf.log") + return 0; +} diff --git a/libc/test/src/math/performance_testing/fmull_perf.cpp b/libc/test/src/math/performance_testing/fmull_perf.cpp new file mode 100644 index 00000000000000..058e10288dbde8 --- /dev/null +++ b/libc/test/src/math/performance_testing/fmull_perf.cpp @@ -0,0 +1,23 @@ +//===-- Performance test for the fmull function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BinaryOpSingleOutputPerf.h" +#include "src/math/fmull.h" + +static constexpr size_t LONG_DOUBLE_ROUNDS = 40; + +float fmull_placeholder_binary(long double x, long double y) { + return static_cast(x * y); +} + +int main() { + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, long double, LIBC_NAMESPACE::fmull, + fmull_placeholder_binary, LONG_DOUBLE_ROUNDS, + "fmull_perf.log") + return 0; +} diff --git a/libc/test/src/math/performance_testing/hypot_perf.cpp b/libc/test/src/math/performance_testing/hypot_perf.cpp index 393697b7540330..04a493ff0e0258 100644 --- a/libc/test/src/math/performance_testing/hypot_perf.cpp +++ b/libc/test/src/math/performance_testing/hypot_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(double, LIBC_NAMESPACE::hypot, ::hypot, +BINARY_OP_SINGLE_OUTPUT_PERF(double, double, LIBC_NAMESPACE::hypot, ::hypot, "hypot_perf.log") diff --git a/libc/test/src/math/performance_testing/hypotf_perf.cpp b/libc/test/src/math/performance_testing/hypotf_perf.cpp index f711729377dacf..8a42f792263c98 100644 --- a/libc/test/src/math/performance_testing/hypotf_perf.cpp +++ b/libc/test/src/math/performance_testing/hypotf_perf.cpp @@ -12,5 +12,5 @@ #include -BINARY_OP_SINGLE_OUTPUT_PERF(float, LIBC_NAMESPACE::hypotf, ::hypotf, +BINARY_OP_SINGLE_OUTPUT_PERF(float, float, LIBC_NAMESPACE::hypotf, ::hypotf, "hypotf_perf.log") diff --git a/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp index 9540112e69ea6a..b77268d107c587 100644 --- a/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp +++ b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp @@ -35,39 +35,39 @@ float16 placeholder_binaryf16(float16 x, float16 y) { return x; } float placeholder_binaryf(float x, float y) { return x; } int main() { - 
BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaxf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fmaxf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fmaxf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fminf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fminf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaximumf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fmaximumf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fmaximumf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminimumf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::fminimumf16, placeholder_binaryf16, FLOAT16_ROUNDS, "fminimumf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fmaximum_numf16, - placeholder_binaryf16, FLOAT16_ROUNDS, - "fmaximum_numf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fminimum_numf16, - placeholder_binaryf16, FLOAT16_ROUNDS, - "fminimum_numf16_perf.log") + BINARY_OP_SINGLE_OUTPUT_PERF_EX( + float16, float16, LIBC_NAMESPACE::fmaximum_numf16, placeholder_binaryf16, + FLOAT16_ROUNDS, "fmaximum_numf16_perf.log") + BINARY_OP_SINGLE_OUTPUT_PERF_EX( + float16, float16, LIBC_NAMESPACE::fminimum_numf16, placeholder_binaryf16, + FLOAT16_ROUNDS, "fminimum_numf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaxf, ::fmaxf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fmaxf, ::fmaxf, FLOAT_ROUNDS, "fmaxf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminf, ::fminf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fminf, ::fminf, FLOAT_ROUNDS, "fminf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaximumf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fmaximumf, placeholder_binaryf, FLOAT_ROUNDS, 
"fmaximumf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminimumf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fminimumf, placeholder_binaryf, FLOAT_ROUNDS, "fminimumf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fmaximum_numf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fmaximum_numf, placeholder_binaryf, FLOAT_ROUNDS, "fmaximum_numf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fminimum_numf, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::fminimum_numf, placeholder_binaryf, FLOAT_ROUNDS, "fminimum_numf_perf.log") diff --git a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp index ace1d21c62c325..9a4522c307ac76 100644 --- a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp +++ b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp @@ -28,14 +28,14 @@ int main() { SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fabsf16, placeholder_unaryf16, FLOAT16_ROUNDS, "fabsf16_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::copysignf16, + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, float16, LIBC_NAMESPACE::copysignf16, placeholder_binaryf16, FLOAT16_ROUNDS, "copysignf16_perf.log") SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fabsf, fabsf, FLOAT_ROUNDS, "fabsf_perf.log") - BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::copysignf, copysignf, - FLOAT_ROUNDS, "copysignf_perf.log") + BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, float, LIBC_NAMESPACE::copysignf, + copysignf, FLOAT_ROUNDS, "copysignf_perf.log") return 0; } From 0a41c8e7a050c837c609cbcbc8342024701cd14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 29 Aug 2024 11:27:42 -0700 Subject: [PATCH 33/72] 
[flang][cuda] Avoid generating cuf.data_transfer in OpenACC region (#106435) `cuf.data_transfer` will be converted to runtime calls to cuda runtime api and these are not supported in device code. assignment in OpenACC region will be handled by the OpenACC code gen so we avoid to generate data transfer on them. --- flang/lib/Lower/Bridge.cpp | 10 ++++- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 44 +++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c48daba8cf7fab..078e17bea55859 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4380,9 +4380,14 @@ class FirConverter : public Fortran::lower::AbstractConverter { // Check if the insertion point is currently in a device context. HostDevice // subprogram are not considered fully device context so it will return false // for it. - static bool isDeviceContext(fir::FirOpBuilder &builder) { + // If the insertion point is inside an OpenACC region op, it is considered + // device context. 
+ static bool isCudaDeviceContext(fir::FirOpBuilder &builder) { if (builder.getRegion().getParentOfType()) return true; + if (builder.getRegion() + .getParentOfType()) + return true; if (auto funcOp = builder.getRegion().getParentOfType()) { if (auto cudaProcAttr = @@ -4401,7 +4406,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Location loc = getCurrentLocation(); fir::FirOpBuilder &builder = getFirOpBuilder(); - bool isInDeviceContext = isDeviceContext(builder); + bool isInDeviceContext = isCudaDeviceContext(builder); + bool isCUDATransfer = (Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs) || Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs)) && !isInDeviceContext; diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 42b37fb89e4ce2..f189bf9b621082 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -1,4 +1,4 @@ -! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir -fopenacc -fcuda %s -o - | FileCheck %s ! Test CUDA Fortran data transfer using assignment statements. @@ -290,3 +290,45 @@ end subroutine ! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> ! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {uniq_name = "_QFsub15Ea_host"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) ! CHECK: cuf.data_transfer %[[AHOST]]#1 to %[[ADEV]]#1, %[[SHAPE]] : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer} : !fir.ref>, !fir.ref> + +! 
Check that cuf.data_transfer are not generated within OpenACC region +subroutine sub16() + integer, parameter :: n = 10 + real, device :: adev(n) + real :: ahost(n) + real, managed :: b + integer :: i + + adev = ahost + !$acc parallel loop deviceptr(adev) + do i = 1, n + adev(i) = adev(i) + b + enddo + + !$acc kernels deviceptr(adev) + do i = 1, n + adev(i) = adev(i) + b + enddo + !$acc end kernels + + + !$acc serial deviceptr(adev) + do i = 1, n + adev(i) = adev(i) + b + enddo + !$acc end serial +end subroutine + +! CHECK-LABEL: func.func @_QPsub16() +! CHECK: cuf.data_transfer +! CHECK: acc.parallel +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign + +! CHECK: acc.kernels +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign + +! CHECK: acc.serial +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign From 6421dcc0a978900091cc7aa8fa443746602cb442 Mon Sep 17 00:00:00 2001 From: Haopeng Liu <153236845+haopliu@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:28:49 -0700 Subject: [PATCH 34/72] [NFC] [DSE] Refactor DSE (#100956) Refactor DSE with MemoryDefWrapper and MemoryLocationWrapper. Normally, one MemoryDef accesses one MemoryLocation. With "initializes" attribute, one MemoryDef (like call instruction) could initialize multiple MemoryLocations. Refactor DSE as a preparation to apply "initializes" attribute in DSE in a follow-up PR (https://github.com/llvm/llvm-project/commit/58dd8a440343055b1a4929d72317218e912c16fd). 
--- .../Scalar/DeadStoreElimination.cpp | 367 ++++++++++-------- 1 file changed, 209 insertions(+), 158 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 992139a95a43d3..a37f295abbd31c 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -806,6 +806,34 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { return false; } +// A memory location wrapper that represents a MemoryLocation, `MemLoc`, +// defined by `MemDef`. +struct MemoryLocationWrapper { + MemoryLocationWrapper(MemoryLocation MemLoc, MemoryDef *MemDef) + : MemLoc(MemLoc), MemDef(MemDef) { + assert(MemLoc.Ptr && "MemLoc should be not null"); + UnderlyingObject = getUnderlyingObject(MemLoc.Ptr); + DefInst = MemDef->getMemoryInst(); + } + + MemoryLocation MemLoc; + const Value *UnderlyingObject; + MemoryDef *MemDef; + Instruction *DefInst; +}; + +// A memory def wrapper that represents a MemoryDef and the MemoryLocation(s) +// defined by this MemoryDef. +struct MemoryDefWrapper { + MemoryDefWrapper(MemoryDef *MemDef, std::optional MemLoc) { + DefInst = MemDef->getMemoryInst(); + if (MemLoc.has_value()) + DefinedLocation = MemoryLocationWrapper(*MemLoc, MemDef); + } + Instruction *DefInst; + std::optional DefinedLocation = std::nullopt; +}; + struct DSEState { Function &F; AliasAnalysis &AA; @@ -1119,6 +1147,15 @@ struct DSEState { return MemoryLocation::getOrNone(I); } + std::optional getLocForInst(Instruction *I) { + if (isMemTerminatorInst(I)) { + if (auto Loc = getLocForTerminator(I)) { + return Loc->first; + } + } + return getLocForWrite(I); + } + /// Assuming this instruction has a dead analyzable write, can we delete /// this instruction? 
bool isRemovable(Instruction *I) { @@ -2132,182 +2169,196 @@ struct DSEState { } return MadeChange; } -}; -static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, - DominatorTree &DT, PostDominatorTree &PDT, - const TargetLibraryInfo &TLI, - const LoopInfo &LI) { - bool MadeChange = false; + // Try to eliminate dead defs that access `KillingLocWrapper.MemLoc` and are + // killed by `KillingLocWrapper.MemDef`. Return whether + // any changes were made, and whether `KillingLocWrapper.DefInst` was deleted. + std::pair + eliminateDeadDefs(const MemoryLocationWrapper &KillingLocWrapper); - DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); - // For each store: - for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *KillingDef = State.MemDefs[I]; - if (State.SkipStores.count(KillingDef)) + // Try to eliminate dead defs killed by `KillingDefWrapper` and return the + // change state: whether make any change. + bool eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper); +}; + +std::pair +DSEState::eliminateDeadDefs(const MemoryLocationWrapper &KillingLocWrapper) { + bool Changed = false; + bool DeletedKillingLoc = false; + unsigned ScanLimit = MemorySSAScanLimit; + unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit; + unsigned PartialLimit = MemorySSAPartialStoreLimit; + // Worklist of MemoryAccesses that may be killed by + // "KillingLocWrapper.MemDef". + SmallSetVector ToCheck; + // Track MemoryAccesses that have been deleted in the loop below, so we can + // skip them. Don't use SkipStores for this, which may contain reused + // MemoryAccess addresses. + SmallPtrSet Deleted; + [[maybe_unused]] unsigned OrigNumSkipStores = SkipStores.size(); + ToCheck.insert(KillingLocWrapper.MemDef->getDefiningAccess()); + + // Check if MemoryAccesses in the worklist are killed by + // "KillingLocWrapper.MemDef". 
+ for (unsigned I = 0; I < ToCheck.size(); I++) { + MemoryAccess *Current = ToCheck[I]; + if (Deleted.contains(Current)) continue; - Instruction *KillingI = KillingDef->getMemoryInst(); + std::optional MaybeDeadAccess = getDomMemoryDef( + KillingLocWrapper.MemDef, Current, KillingLocWrapper.MemLoc, + KillingLocWrapper.UnderlyingObject, ScanLimit, WalkerStepLimit, + isMemTerminatorInst(KillingLocWrapper.DefInst), PartialLimit); - std::optional MaybeKillingLoc; - if (State.isMemTerminatorInst(KillingI)) { - if (auto KillingLoc = State.getLocForTerminator(KillingI)) - MaybeKillingLoc = KillingLoc->first; - } else { - MaybeKillingLoc = State.getLocForWrite(KillingI); + if (!MaybeDeadAccess) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; } - - if (!MaybeKillingLoc) { - LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " - << *KillingI << "\n"); + MemoryAccess *DeadAccess = *MaybeDeadAccess; + LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess); + if (isa(DeadAccess)) { + LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n"); + for (Value *V : cast(DeadAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DeadAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. 
+ if (PostOrderNumbers[IncomingBlock] > PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } continue; } - MemoryLocation KillingLoc = *MaybeKillingLoc; - assert(KillingLoc.Ptr && "KillingLoc should not be null"); - const Value *KillingUndObj = getUnderlyingObject(KillingLoc.Ptr); - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " - << *KillingDef << " (" << *KillingI << ")\n"); - - unsigned ScanLimit = MemorySSAScanLimit; - unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit; - unsigned PartialLimit = MemorySSAPartialStoreLimit; - // Worklist of MemoryAccesses that may be killed by KillingDef. - SmallSetVector ToCheck; - // Track MemoryAccesses that have been deleted in the loop below, so we can - // skip them. Don't use SkipStores for this, which may contain reused - // MemoryAccess addresses. - SmallPtrSet Deleted; - [[maybe_unused]] unsigned OrigNumSkipStores = State.SkipStores.size(); - ToCheck.insert(KillingDef->getDefiningAccess()); - - bool Shortend = false; - bool IsMemTerm = State.isMemTerminatorInst(KillingI); - // Check if MemoryAccesses in the worklist are killed by KillingDef. 
- for (unsigned I = 0; I < ToCheck.size(); I++) { - MemoryAccess *Current = ToCheck[I]; - if (Deleted.contains(Current)) - continue; - - std::optional MaybeDeadAccess = State.getDomMemoryDef( - KillingDef, Current, KillingLoc, KillingUndObj, ScanLimit, - WalkerStepLimit, IsMemTerm, PartialLimit); - - if (!MaybeDeadAccess) { - LLVM_DEBUG(dbgs() << " finished walk\n"); + MemoryDefWrapper DeadDefWrapper( + cast(DeadAccess), + getLocForInst(cast(DeadAccess)->getMemoryInst())); + MemoryLocationWrapper &DeadLocWrapper = *DeadDefWrapper.DefinedLocation; + LLVM_DEBUG(dbgs() << " (" << *DeadLocWrapper.DefInst << ")\n"); + ToCheck.insert(DeadLocWrapper.MemDef->getDefiningAccess()); + NumGetDomMemoryDefPassed++; + + if (!DebugCounter::shouldExecute(MemorySSACounter)) + continue; + if (isMemTerminatorInst(KillingLocWrapper.DefInst)) { + if (KillingLocWrapper.UnderlyingObject != DeadLocWrapper.UnderlyingObject) continue; + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DeadLocWrapper.DefInst << "\n KILLER: " + << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(DeadLocWrapper.DefInst, &Deleted); + ++NumFastStores; + Changed = true; + } else { + // Check if DeadI overwrites KillingI. + int64_t KillingOffset = 0; + int64_t DeadOffset = 0; + OverwriteResult OR = + isOverwrite(KillingLocWrapper.DefInst, DeadLocWrapper.DefInst, + KillingLocWrapper.MemLoc, DeadLocWrapper.MemLoc, + KillingOffset, DeadOffset); + if (OR == OW_MaybePartial) { + auto Iter = + IOLs.insert(std::make_pair( + DeadLocWrapper.DefInst->getParent(), InstOverlapIntervalsTy())); + auto &IOL = Iter.first->second; + OR = isPartialOverwrite(KillingLocWrapper.MemLoc, DeadLocWrapper.MemLoc, + KillingOffset, DeadOffset, + DeadLocWrapper.DefInst, IOL); } - - MemoryAccess *DeadAccess = *MaybeDeadAccess; - LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess); - if (isa(DeadAccess)) { - LLVM_DEBUG(dbgs() << "\n ... 
adding incoming values to worklist\n"); - for (Value *V : cast(DeadAccess)->incoming_values()) { - MemoryAccess *IncomingAccess = cast(V); - BasicBlock *IncomingBlock = IncomingAccess->getBlock(); - BasicBlock *PhiBlock = DeadAccess->getBlock(); - - // We only consider incoming MemoryAccesses that come before the - // MemoryPhi. Otherwise we could discover candidates that do not - // strictly dominate our starting def. - if (State.PostOrderNumbers[IncomingBlock] > - State.PostOrderNumbers[PhiBlock]) - ToCheck.insert(IncomingAccess); + if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) { + auto *DeadSI = dyn_cast(DeadLocWrapper.DefInst); + auto *KillingSI = dyn_cast(KillingLocWrapper.DefInst); + // We are re-using tryToMergePartialOverlappingStores, which requires + // DeadSI to dominate KillingSI. + // TODO: implement tryToMergeParialOverlappingStores using MemorySSA. + if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) { + if (Constant *Merged = tryToMergePartialOverlappingStores( + KillingSI, DeadSI, KillingOffset, DeadOffset, DL, BatchAA, + &DT)) { + + // Update stored value of earlier store to merged constant. + DeadSI->setOperand(0, Merged); + ++NumModifiedStores; + Changed = true; + DeletedKillingLoc = true; + + // Remove killing store and remove any outstanding overlap + // intervals for the updated store. 
+ deleteDeadInstruction(KillingSI, &Deleted); + auto I = IOLs.find(DeadSI->getParent()); + if (I != IOLs.end()) + I->second.erase(DeadSI); + break; + } } - continue; } - auto *DeadDefAccess = cast(DeadAccess); - Instruction *DeadI = DeadDefAccess->getMemoryInst(); - LLVM_DEBUG(dbgs() << " (" << *DeadI << ")\n"); - ToCheck.insert(DeadDefAccess->getDefiningAccess()); - NumGetDomMemoryDefPassed++; - - if (!DebugCounter::shouldExecute(MemorySSACounter)) - continue; - - MemoryLocation DeadLoc = *State.getLocForWrite(DeadI); - - if (IsMemTerm) { - const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr); - if (KillingUndObj != DeadUndObj) - continue; - LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI - << "\n KILLER: " << *KillingI << '\n'); - State.deleteDeadInstruction(DeadI, &Deleted); + if (OR == OW_Complete) { + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DeadLocWrapper.DefInst << "\n KILLER: " + << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(DeadLocWrapper.DefInst, &Deleted); ++NumFastStores; - MadeChange = true; - } else { - // Check if DeadI overwrites KillingI. - int64_t KillingOffset = 0; - int64_t DeadOffset = 0; - OverwriteResult OR = State.isOverwrite( - KillingI, DeadI, KillingLoc, DeadLoc, KillingOffset, DeadOffset); - if (OR == OW_MaybePartial) { - auto Iter = State.IOLs.insert( - std::make_pair( - DeadI->getParent(), InstOverlapIntervalsTy())); - auto &IOL = Iter.first->second; - OR = isPartialOverwrite(KillingLoc, DeadLoc, KillingOffset, - DeadOffset, DeadI, IOL); - } - - if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) { - auto *DeadSI = dyn_cast(DeadI); - auto *KillingSI = dyn_cast(KillingI); - // We are re-using tryToMergePartialOverlappingStores, which requires - // DeadSI to dominate KillingSI. - // TODO: implement tryToMergeParialOverlappingStores using MemorySSA. 
- if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) { - if (Constant *Merged = tryToMergePartialOverlappingStores( - KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL, - State.BatchAA, &DT)) { - - // Update stored value of earlier store to merged constant. - DeadSI->setOperand(0, Merged); - ++NumModifiedStores; - MadeChange = true; - - Shortend = true; - // Remove killing store and remove any outstanding overlap - // intervals for the updated store. - State.deleteDeadInstruction(KillingSI, &Deleted); - auto I = State.IOLs.find(DeadSI->getParent()); - if (I != State.IOLs.end()) - I->second.erase(DeadSI); - break; - } - } - } - - if (OR == OW_Complete) { - LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI - << "\n KILLER: " << *KillingI << '\n'); - State.deleteDeadInstruction(DeadI, &Deleted); - ++NumFastStores; - MadeChange = true; - } + Changed = true; } } + } - assert(State.SkipStores.size() - OrigNumSkipStores == Deleted.size() && - "SkipStores and Deleted out of sync?"); + assert(SkipStores.size() - OrigNumSkipStores == Deleted.size() && + "SkipStores and Deleted out of sync?"); - // Check if the store is a no-op. - if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) { - LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI - << '\n'); - State.deleteDeadInstruction(KillingI); - NumRedundantStores++; - MadeChange = true; - continue; - } + return {Changed, DeletedKillingLoc}; +} - // Can we form a calloc from a memset/malloc pair? 
- if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) { - LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n" - << " DEAD: " << *KillingI << '\n'); - State.deleteDeadInstruction(KillingI); - MadeChange = true; +bool DSEState::eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper) { + if (!KillingDefWrapper.DefinedLocation.has_value()) { + LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " + << *KillingDefWrapper.DefInst << "\n"); + return false; + } + + auto &KillingLocWrapper = *KillingDefWrapper.DefinedLocation; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *KillingLocWrapper.MemDef << " (" + << *KillingLocWrapper.DefInst << ")\n"); + auto [Changed, DeletedKillingLoc] = eliminateDeadDefs(KillingLocWrapper); + + // Check if the store is a no-op. + if (!DeletedKillingLoc && storeIsNoop(KillingLocWrapper.MemDef, + KillingLocWrapper.UnderlyingObject)) { + LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " + << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(KillingLocWrapper.DefInst); + NumRedundantStores++; + return true; + } + // Can we form a calloc from a memset/malloc pair? 
+ if (!DeletedKillingLoc && + tryFoldIntoCalloc(KillingLocWrapper.MemDef, + KillingLocWrapper.UnderlyingObject)) { + LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n" + << " DEAD: " << *KillingLocWrapper.DefInst << '\n'); + deleteDeadInstruction(KillingLocWrapper.DefInst); + return true; + } + return Changed; +} + +static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI, + const LoopInfo &LI) { + bool MadeChange = false; + DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); + // For each store: + for (unsigned I = 0; I < State.MemDefs.size(); I++) { + MemoryDef *KillingDef = State.MemDefs[I]; + if (State.SkipStores.count(KillingDef)) continue; - } + + MemoryDefWrapper KillingDefWrapper( + KillingDef, State.getLocForInst(KillingDef->getMemoryInst())); + MadeChange |= State.eliminateDeadDefs(KillingDefWrapper); } if (EnablePartialOverwriteTracking) From 22ba3511087c85e3b1d4cad686f8d9c3aa6f8088 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 11:35:43 -0700 Subject: [PATCH 35/72] [RISCV][SLP] Test for <3 x Ty> reductions which require reordering These tests show a vectorizable reduction where the order of the reduction has been adjusted so that profitable vectorization requires a reordering of the computation. We currently have no reordering in SLP for non-power-of-two vectors, so this doesn't work. Note that due to reassociation performed in the standard pipeline, this is actually the canonical form for a reduction reaching SLP. 
--- .../SLPVectorizer/RISCV/vec3-base.ll | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 1ff286248c4a7a..c712acc4ea1893 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -554,6 +554,52 @@ define i32 @dot_product_i32(ptr %a, ptr %b) { ret i32 %add.1 } +; Same as above, except the reduction order has been perturbed. This +; is checking for our ability to reorder. +define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { +; CHECK-LABEL: @dot_product_i32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret i32 [[ADD_1]] +; + %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0 + %l.a.0 = load i32, ptr %gep.a.0, align 4 + 
%gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1 + %l.a.1 = load i32, ptr %gep.a.1, align 4 + %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2 + %l.a.2 = load i32, ptr %gep.a.2, align 4 + + %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0 + %l.b.0 = load i32, ptr %gep.b.0, align 4 + %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1 + %l.b.1 = load i32, ptr %gep.b.1, align 4 + %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2 + %l.b.2 = load i32, ptr %gep.b.2, align 4 + + %mul.0 = mul nsw i32 %l.a.0, %l.b.0 + %mul.1 = mul nsw i32 %l.a.1, %l.b.1 + %mul.2 = mul nsw i32 %l.a.2, %l.b.2 + + %add.0 = add i32 %mul.1, %mul.0 + %add.1 = add i32 %add.0, %mul.2 + ret i32 %add.1 +} + define float @dot_product_fp32(ptr %a, ptr %b) { ; NON-POW2-LABEL: @dot_product_fp32( ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 @@ -604,6 +650,50 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ret float %add.1 } +; Same as above, except the reduction order has been perturbed. This +; is checking for our ability to reorder. 
+define float @dot_product_fp32_reorder(ptr %a, ptr %b) { +; CHECK-LABEL: @dot_product_fp32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] +; + %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 + %l.a.0 = load float, ptr %gep.a.0, align 4 + %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1 + %l.a.1 = load float, ptr %gep.a.1, align 4 + %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2 + %l.a.2 = load float, ptr %gep.a.2, align 4 + + %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0 + %l.b.0 = load float, ptr %gep.b.0, align 4 + %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1 + %l.b.1 = load float, ptr %gep.b.1, align 4 + %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2 + %l.b.2 = load float, ptr %gep.b.2, align 4 + + %mul.0 = fmul fast float %l.a.0, %l.b.0 + %mul.1 = fmul fast float %l.a.1, %l.b.1 + %mul.2 = fmul fast float %l.a.2, %l.b.2 + + %add.0 = fadd fast float %mul.1, %mul.0 + %add.1 = fadd fast float %add.0, %mul.2 + 
ret float %add.1 +} + + define double @dot_product_fp64(ptr %a, ptr %b) { ; NON-POW2-LABEL: @dot_product_fp64( ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 From 26b0bef192be1a3adc250af460c2e728a1ca5a64 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 29 Aug 2024 11:43:58 -0700 Subject: [PATCH 36/72] AMDGPU: Use pattern to select instruction for intrinsic llvm.fptrunc.round (#105761) Use GCNPat instead of Custom Lowering to select instructions for intrinsic llvm.fptrunc.round. "SupportedRoundMode : TImmLeaf" is used as a predicate to select only when the rounding mode is supported. "as_hw_round_mode : SDNodeXForm" is developed to translate the round modes to the corresponding ones that hardware recognizes. --- .../Target/GlobalISel/SelectionDAGCompat.td | 1 + .../include/llvm/Target/TargetSelectionDAG.td | 5 + llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 5 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 - llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 - .../AMDGPU/AMDGPUInstructionSelector.cpp | 10 ++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 33 +--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 1 - .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 28 +--- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 - llvm/lib/Target/AMDGPU/SIInstrInfo.td | 22 ++- llvm/lib/Target/AMDGPU/SIInstructions.td | 11 +- .../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 7 +- .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 158 +++++++++--------- 16 files changed, 128 insertions(+), 161 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index e9dbdef9fe9e7c..72d155b483cf2b 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -161,6 +161,7 @@ def : GINodeEquiv; def : 
GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 172deffbd31771..dd79002dcbdb48 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -158,6 +158,9 @@ def SDTFPUnaryOp : SDTypeProfile<1, 1, [ // fneg, fsqrt, etc def SDTFPRoundOp : SDTypeProfile<1, 1, [ // fpround SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1> ]>; +def SDTFPTruncRoundOp : SDTypeProfile<1, 2, [ + SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1> +]>; def SDTFPExtendOp : SDTypeProfile<1, 1, [ // fpextend SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1> ]>; @@ -552,6 +555,8 @@ def llround : SDNode<"ISD::LLROUND" , SDTFPToIntOp>; def lrint : SDNode<"ISD::LRINT" , SDTFPToIntOp>; def llrint : SDNode<"ISD::LLRINT" , SDTFPToIntOp>; +def fptrunc_round : SDNode<"ISD::FPTRUNC_ROUND", SDTFPTruncRoundOp>; + def fpround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>; def fpextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>; def fcopysign : SDNode<"ISD::FCOPYSIGN" , SDTFPSignOp>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 8bee84b8a87f27..118271af879937 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -297,8 +297,6 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; - class GISelSop2Pat < SDPatternOperator node, Instruction inst, @@ -419,3 +417,6 @@ def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameInde def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, GISDNodeXFormEquiv; + +def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, + GISDNodeXFormEquiv; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index d24836b7eeb095..015dbc79ef9e4d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5511,7 +5511,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(LDS) - NODE_NAME_CASE(FPTRUNC_ROUND) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 59f640ea99de3e..dd9d97bd593bda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -553,7 +553,6 @@ enum NodeType : unsigned { CONST_DATA_PTR, PC_ADD_REL_OFFSET, LDS, - FPTRUNC_ROUND, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 17071970ca4bfe..3fcb364fc2c536 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5594,6 +5594,16 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB, MIB.addImm(ExpVal); } +void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 + // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 + // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 + // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 + MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); +} + bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const { return TII.isInlineConstant(Imm); } diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 207cd67f0eda0e..068db5c1c14496 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -359,6 +359,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + bool isInlineImmediate(const APInt &Imm) const; bool isInlineImmediate(const APFloat &Imm) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 4fd917f5ea7fa8..3f6486d44f0ee5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1137,7 +1137,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .lower(); getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) - .customFor({S16, S32}) + .legalFor({S16, S32}) .scalarize(0) .lower(); @@ -2179,8 +2179,6 @@ bool AMDGPULegalizerInfo::legalizeCustom( return legalizeCTLZ_CTTZ(MI, MRI, B); case TargetOpcode::G_CTLZ_ZERO_UNDEF: return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); - case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: - return legalizeFPTruncRound(MI, B); case TargetOpcode::G_STACKSAVE: return legalizeStackSave(MI, B); case TargetOpcode::G_GET_FPENV: @@ -7093,35 +7091,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, - MachineIRBuilder &B) const { - MachineRegisterInfo &MRI = *B.getMRI(); - Register Src = MI.getOperand(1).getReg(); - if (MRI.getType(Src) != LLT::scalar(32)) - return false; - - // Only support towardzero, tonearest, upward and downward. 
- int RoundMode = MI.getOperand(2).getImm(); - if (RoundMode != (int)RoundingMode::TowardZero && - RoundMode != (int)RoundingMode::NearestTiesToEven && - RoundMode != (int)RoundingMode::TowardPositive && - RoundMode != (int)RoundingMode::TowardNegative) - return false; - - // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 - // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 - // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 - // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 - unsigned HW_Mode = (RoundMode + 3) % 4; - B.buildInstr(AMDGPU::G_FPTRUNC_ROUND) - .addDef(MI.getOperand(0).getReg()) - .addUse(Src) - .addImm(HW_Mode); - - MI.eraseFromParent(); - return true; -} - bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const { const SITargetLowering *TLI = ST.getTargetLowering(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index db1c5874093a71..a815e87a7da35f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -212,7 +212,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; - bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 69a1936a11fe05..4737a322c255f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5255,7 +5255,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); break; } - case AMDGPU::G_FPTRUNC_ROUND: + case 
AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: return getDefaultMappingVOP(MI); case AMDGPU::G_PREFETCH: OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1437f3d58b5e79..81b52935ddf397 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -598,7 +598,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP1 Actions. setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, - ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, + ISD::FSIN, ISD::FROUND}, MVT::f16, Custom); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); @@ -5797,8 +5797,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return lowerFP_ROUND(Op, DAG); - case ISD::FPTRUNC_ROUND: - return lowerFPTRUNC_ROUND(Op, DAG); case ISD::TRAP: return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: @@ -6648,30 +6646,6 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, DAG.getTargetConstant(0, DL, MVT::i32)); } -SDValue SITargetLowering::lowerFPTRUNC_ROUND(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0)->getValueType(0) != MVT::f32) - return SDValue(); - - // Only support towardzero, tonearest, upward and downward. 
- int RoundMode = Op.getConstantOperandVal(1); - if (RoundMode != (int)RoundingMode::TowardZero && - RoundMode != (int)RoundingMode::NearestTiesToEven && - RoundMode != (int)RoundingMode::TowardPositive && - RoundMode != (int)RoundingMode::TowardNegative) - return SDValue(); - - // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 - // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 - // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 - // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 - unsigned HW_Mode = (RoundMode + 3) % 4; - SDLoc DL(Op); - SDValue RoundFlag = DAG.getTargetConstant(HW_Mode, DL, MVT::i32); - return DAG.getNode(AMDGPUISD::FPTRUNC_ROUND, DL, Op.getNode()->getVTList(), - Op->getOperand(0), RoundFlag); -} - SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f16 && "Do not know how to custom lower FP_ROUND for non-f16 type"); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index eed4b3e79cdeee..1f198a92c0fa6a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -145,7 +145,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 2b54429dc9a03f..faa8ca282e7ab8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -304,12 +304,6 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] >; -def SDTFPRoundModeOp : SDTypeProfile<1, 2, [ - SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1> -]>; - -def SIfptrunc_round : SDNode<"AMDGPUISD::FPTRUNC_ROUND", SDTFPRoundModeOp>; - //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -796,6 +790,22 @@ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; +def as_hw_round_mode : SDNodeXForm TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3 + // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0 + // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1 + // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2 + return CurDAG->getTargetConstant((N->getSExtValue() + 3) % 4, SDLoc(N), + MVT::i32); +}]>; + +def SupportedRoundMode : TImmLeaf; + class bitextract_imm : SDNodeXFormgetZExtValue(); unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 814d3182fb5df8..69e1b9a38324f2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -229,10 +229,12 @@ def S_INVERSE_BALLOT_U64 : 
SPseudoInstSI< // in the ModeRegister pass. let Uses = [MODE, EXEC] in { def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VGPR_32:$src0, i32imm:$round), - [(set f16:$vdst, (SIfptrunc_round f32:$src0, i32:$round))]>; + (ins VGPR_32:$src0, i32imm:$round)>; } // End Uses = [MODE, EXEC] +def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>; + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. let Defs = [SCC], isConvergent = 1 in { @@ -4055,11 +4057,6 @@ def G_SI_CALL : AMDGPUGenericInstruction { let isConvergent = 1; } -def G_FPTRUNC_ROUND : AMDGPUGenericInstruction { - let OutOperandList = (outs type0:$vdst); - let InOperandList = (ins type1:$src0, untyped_imm_0:$round); - let hasSideEffects = 0; -} //============================================================================// // Dummy Instructions diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll index 4bcd0cf5e6a0e5..f1d5b07e832c48 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll @@ -1,9 +1,8 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefixes=SDAG-FAIL -; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=GISEL-FAIL +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) { -; SDAG-FAIL: LLVM ERROR: Cannot select -; GISEL-FAIL: unable to legalize instruction +; FAIL: 
LLVM ERROR: Cannot select %res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward") store half %res, ptr addrspace(1) %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll index 71d0ee524bab73..54ed6f1eb42820 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll @@ -176,8 +176,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ; return to shader part epilog %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") ret <2 x half> %res @@ -197,8 +196,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ; return to shader part epilog %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward") ret <2 x half> %res @@ -228,23 +226,18 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2 ; 
GISEL-NEXT: v_cvt_f16_f32_e32 v7, v3 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7 +; GISEL-NEXT: v_pack_b32_f16 v1, v1, v2 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 -; GISEL-NEXT: v_pk_add_f16 v0, v0, v1 -; GISEL-NEXT: v_pk_add_f16 v0, v2, v0 +; GISEL-NEXT: v_pk_add_f16 v0, v0, v3 +; GISEL-NEXT: v_pk_add_f16 v0, v1, v0 ; GISEL-NEXT: global_store_dword v[4:5], v0, off ; GISEL-NEXT: s_endpgm %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") @@ -295,31 +288,54 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> } define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) { -; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: v_mov_b32_e32 v3, s2 -; CHECK-NEXT: v_mov_b32_e32 v4, s1 -; CHECK-NEXT: v_mov_b32_e32 v5, s3 -; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v3 -; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CHECK-NEXT: v_cvt_f16_f32_e32 v7, v5 -; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CHECK-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; CHECK-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; CHECK-NEXT: 
v_cvt_f16_f32_e32 v4, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 -; CHECK-NEXT: v_pk_add_f16 v2, v2, v5 -; CHECK-NEXT: v_pk_add_f16 v2, v3, v2 -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm +; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SDAG-NEXT: v_mov_b32_e32 v4, s1 +; SDAG-NEXT: v_mov_b32_e32 v5, s3 +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 +; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 +; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SDAG-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SDAG-NEXT: v_lshl_or_b32 v5, v7, 16, v6 +; SDAG-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 +; SDAG-NEXT: v_pk_add_f16 v2, v2, v5 +; SDAG-NEXT: v_pk_add_f16 v2, v3, v2 +; SDAG-NEXT: global_store_dword v[0:1], v2, off +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GISEL-NEXT: v_pack_b32_f16 v2, v2, v3 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GISEL-NEXT: 
v_cvt_f16_f32_e32 v4, v5 +; GISEL-NEXT: v_pack_b32_f16 v5, v6, v7 +; GISEL-NEXT: v_pack_b32_f16 v3, v3, v4 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0 +; GISEL-NEXT: v_pk_add_f16 v2, v2, v5 +; GISEL-NEXT: v_pk_add_f16 v2, v3, v2 +; GISEL-NEXT: global_store_dword v[0:1], v2, off +; GISEL-NEXT: s_endpgm %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward") %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward") @@ -344,8 +360,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GISEL-NEXT: ; return to shader part epilog %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward") @@ -367,8 +382,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GISEL-NEXT: ; return to shader part epilog %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward") @@ -391,13 +405,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> % ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 ; GISEL-NEXT: ; return to shader part epilog %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward") ret <4 x half> %res @@ -419,13 +431,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 ; GISEL-NEXT: ; return to shader part epilog %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward") ret <4 x half> %res @@ -453,21 +463,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> % ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 +; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5 +; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7 ; GISEL-NEXT: ; return to shader part epilog %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward") ret <8 x half> %res @@ -495,21 +501,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 +; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3 +; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5 +; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7 ; GISEL-NEXT: ; return to shader part epilog %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward") ret <8 x half> %res From c1248c9d64e9210554571283980156b1d85cfe09 Mon Sep 17 00:00:00 2001 
From: "Oleksandr T." Date: Thu, 29 Aug 2024 21:45:46 +0300 Subject: [PATCH 37/72] [Clang] prevent assertion failure when converting vectors to int/float with invalid expressions (#105727) Fixes #105486 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaExpr.cpp | 6 ++++++ clang/test/SemaCXX/vector.cpp | 14 ++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6df8bc64f1c7db..27f3d6e05da9f5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -326,6 +326,7 @@ Bug Fixes to C++ Support - Fix evaluation of the index of dependent pack indexing expressions/types specifiers (#GH105900) - Correctly handle subexpressions of an immediate invocation in the presence of implicit casts. (#GH105558) - Clang now correctly handles direct-list-initialization of a structured bindings from an array. (#GH31813) +- Fixed an assertion failure when converting vectors to int/float with invalid expressions. (#GH105486) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 95f53dfefbcc52..de316f30e9523d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9888,6 +9888,9 @@ static ExprResult convertVector(Expr *E, QualType ElementType, Sema &S) { /// IntTy without losing precision. static bool canConvertIntToOtherIntTy(Sema &S, ExprResult *Int, QualType OtherIntTy) { + if (Int->get()->containsErrors()) + return false; + QualType IntTy = Int->get()->getType().getUnqualifiedType(); // Reject cases where the value of the Int is unknown as that would @@ -9926,6 +9929,9 @@ static bool canConvertIntToOtherIntTy(Sema &S, ExprResult *Int, /// FloatTy without losing precision. 
static bool canConvertIntTyToFloatTy(Sema &S, ExprResult *Int, QualType FloatTy) { + if (Int->get()->containsErrors()) + return false; + QualType IntTy = Int->get()->getType().getUnqualifiedType(); // Determine if the integer constant can be expressed as a floating point diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp index 7c8ee89814e578..808bdb679b09cd 100644 --- a/clang/test/SemaCXX/vector.cpp +++ b/clang/test/SemaCXX/vector.cpp @@ -772,3 +772,17 @@ void test_scoped_enum_vector(EnumClass ea, v2u v2ua) { } #endif } + +namespace GH105486 { +__attribute__((__vector_size__(sizeof(double)))) double a; +double b = a - (long)(*0); // expected-error {{indirection requires pointer operand ('int' invalid)}} \ + // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}} + +__attribute__((__vector_size__(sizeof(long)))) long c; +long d = c - (long)(*0); // expected-error {{indirection requires pointer operand ('int' invalid)}} \ + // expected-error {{cannot initialize a variable of type 'long' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}} + +const long long e = *0; // expected-error {{indirection requires pointer operand ('int' invalid)}} +double f = a - e; // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}} +int h = c - e; // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}} +} From e9eaf19eb605c14bed7a0f76d206c13a8eaf842f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Ferenc=20Szab=C3=B3?= <30732159+dfszabo@users.noreply.github.com> Date: Thu, 29 Aug 2024 20:53:28 +0200 Subject: [PATCH 38/72] [CodeGen] Allow 
mixed scalar type constraints for inline asm (#65465) GCC supports code like "asm volatile ("" : "=r" (i) : "0" (f))" where i is integer type and f is floating point type. Currently this code produces an error with Clang. The change allows mixed scalar types between input and output constraints. Co-authored-by: Matt Arsenault --- .../SelectionDAG/SelectionDAGBuilder.cpp | 8 ++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 7 ++- llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll | 61 +++++++++++++++++++ 3 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 521a4fee8aafe0..4b326ba76f97f2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9591,9 +9591,11 @@ static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo, std::pair InputRC = TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode, MatchingOpInfo.ConstraintVT); - if ((OpInfo.ConstraintVT.isInteger() != - MatchingOpInfo.ConstraintVT.isInteger()) || - (MatchRC.second != InputRC.second)) { + const bool OutOpIsIntOrFP = + OpInfo.ConstraintVT.isInteger() || OpInfo.ConstraintVT.isFloatingPoint(); + const bool InOpIsIntOrFP = MatchingOpInfo.ConstraintVT.isInteger() || + MatchingOpInfo.ConstraintVT.isFloatingPoint(); + if ((OutOpIsIntOrFP != InOpIsIntOrFP) || (MatchRC.second != InputRC.second)) { // FIXME: error out in a more elegant fashion report_fatal_error("Unsupported asm: input constraint" " with a matching output constraint of" diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4e796289cff0a1..01feec0c435edf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5856,8 +5856,11 @@ 
TargetLowering::ParseConstraints(const DataLayout &DL, std::pair InputRC = getRegForInlineAsmConstraint(TRI, Input.ConstraintCode, Input.ConstraintVT); - if ((OpInfo.ConstraintVT.isInteger() != - Input.ConstraintVT.isInteger()) || + const bool OutOpIsIntOrFP = OpInfo.ConstraintVT.isInteger() || + OpInfo.ConstraintVT.isFloatingPoint(); + const bool InOpIsIntOrFP = Input.ConstraintVT.isInteger() || + Input.ConstraintVT.isFloatingPoint(); + if ((OutOpIsIntOrFP != InOpIsIntOrFP) || (MatchRC.second != InputRC.second)) { report_fatal_error("Unsupported asm: input constraint" " with a matching output constraint of" diff --git a/llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll b/llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll new file mode 100644 index 00000000000000..d2255d9970b123 --- /dev/null +++ b/llvm/test/CodeGen/X86/inline-asm-int-to-fp.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr +avx < %s | FileCheck %s + +; The C source used as a base for generating this test:. 
+ +; unsigned test(float f) +; { +; unsigned i; +; // Copies f into the output operand i +; asm volatile ("" : "=r" (i) : "0" (f)); +; return i; +; } + + +define i32 @test_int_float(float %f) { +; CHECK-LABEL: test_int_float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: retq +entry: + %asm_call = call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(float %f) + ret i32 %asm_call +} + +define i32 @test_int_ptr(ptr %f) { +; CHECK-LABEL: test_int_ptr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq +entry: + %asm_call = call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(ptr %f) + ret i32 %asm_call +} + +define i64 @test_int_vec(<4 x i16> %v) { +; CHECK-LABEL: test_int_vec: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: retq +entry: + %asm_call = call i64 asm sideeffect "", "=v,0,~{dirflag},~{fpsr},~{flags}"(<4 x i16> %v) + ret i64 %asm_call +} + +define <4 x i32> @test_int_vec_float_vec(<4 x float> %f) { +; CHECK-LABEL: test_int_vec_float_vec: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: retq +entry: + %asm_call = call <4 x i32> asm sideeffect "", "=v,0,~{dirflag},~{fpsr},~{flags}"(<4 x float> %f) + ret <4 x i32> %asm_call +} From ff04c5b2e69481fc3b828bfcf32e05ff7a2c4b05 Mon Sep 17 00:00:00 2001 From: Dan Liew Date: Thu, 29 Aug 2024 12:00:28 -0700 Subject: [PATCH 39/72] [NFC][Sema] Move `Sema::AssignmentAction` into its own scoped enum (#106453) The primary motivation behind this is to allow the enum type to be referred to earlier in the Sema.h file which is needed for #106321. It was requested in #106321 that a scoped enum be used (rather than moving the enum declaration earlier in the Sema class declaration). 
Unfortunately doing this creates a lot of churn as all use sites of the enum constants had to be changed. Appologies to all downstream forks in advanced. Note the AA_ prefix has been dropped from the enum value names as they are now redundant. --- clang/include/clang/Sema/Sema.h | 31 +++++++----- clang/lib/Sema/SemaARM.cpp | 5 +- clang/lib/Sema/SemaCast.cpp | 4 +- clang/lib/Sema/SemaChecking.cpp | 3 +- clang/lib/Sema/SemaDeclCXX.cpp | 3 +- clang/lib/Sema/SemaExpr.cpp | 32 ++++++------ clang/lib/Sema/SemaExprCXX.cpp | 47 +++++++++--------- clang/lib/Sema/SemaInit.cpp | 23 ++++----- clang/lib/Sema/SemaOpenMP.cpp | 76 ++++++++++++++++------------- clang/lib/Sema/SemaOverload.cpp | 30 +++++++----- clang/lib/Sema/SemaPseudoObject.cpp | 2 +- clang/lib/Sema/SemaStmt.cpp | 3 +- 12 files changed, 142 insertions(+), 117 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 821182e8356428..0358259945c796 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -204,6 +204,24 @@ class SemaPPCallbacks; class TemplateDeductionInfo; } // namespace sema +// AssignmentAction - This is used by all the assignment diagnostic functions +// to represent what is actually causing the operation +enum class AssignmentAction { + Assigning, + Passing, + Returning, + Converting, + Initializing, + Sending, + Casting, + Passing_CFAudited +}; +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const AssignmentAction &AA) { + DB << llvm::to_underlying(AA); + return DB; +} + namespace threadSafety { class BeforeSet; void threadSafetyCleanup(BeforeSet *Cache); @@ -6493,19 +6511,6 @@ class Sema final : public SemaBase { /// cleanup that are created by the current full expression. 
SmallVector ExprCleanupObjects; - // AssignmentAction - This is used by all the assignment diagnostic functions - // to represent what is actually causing the operation - enum AssignmentAction { - AA_Assigning, - AA_Passing, - AA_Returning, - AA_Converting, - AA_Initializing, - AA_Sending, - AA_Casting, - AA_Passing_CFAudited - }; - /// Determine whether the use of this declaration is valid, without /// emitting diagnostics. bool CanUseDecl(NamedDecl *D, bool TreatUnavailableAsInvalid); diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index e18872f0dc551e..185e0427d5c995 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -795,7 +795,8 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI, if (RHS.isInvalid()) return true; if (SemaRef.DiagnoseAssignmentResult(ConvTy, Arg->getBeginLoc(), LHSTy, - RHSTy, RHS.get(), Sema::AA_Assigning)) + RHSTy, RHS.get(), + AssignmentAction::Assigning)) return true; } @@ -921,7 +922,7 @@ bool SemaARM::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CastNeeded = CK_BitCast; Diag(DRE->getBeginLoc(), diag::ext_typecheck_convert_discards_qualifiers) << PointerArg->getType() << Context.getPointerType(AddrType) - << Sema::AA_Passing << PointerArg->getSourceRange(); + << AssignmentAction::Passing << PointerArg->getSourceRange(); } // Finally, do the cast and replace the argument with the corrected version. diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index eca8363ee9605c..f01b22a72915c8 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2673,7 +2673,7 @@ void CastOperation::checkAddressSpaceCast(QualType SrcType, QualType DestType) { ? 
DestPPointee.getAddressSpace() != SrcPPointee.getAddressSpace() : !DestPPointee.isAddressSpaceOverlapping(SrcPPointee)) { Self.Diag(OpRange.getBegin(), DiagID) - << SrcType << DestType << Sema::AA_Casting + << SrcType << DestType << AssignmentAction::Casting << SrcExpr.get()->getSourceRange(); if (!Nested) SrcExpr = ExprError(); @@ -3213,7 +3213,7 @@ void CastOperation::CheckCStyleCast() { !CastQuals.compatiblyIncludesObjCLifetime(ExprQuals)) { Self.Diag(SrcExpr.get()->getBeginLoc(), diag::err_typecheck_incompatible_ownership) - << SrcType << DestType << Sema::AA_Casting + << SrcType << DestType << AssignmentAction::Casting << SrcExpr.get()->getSourceRange(); return; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index ee143381cf4f79..b021e27209cf1b 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4880,7 +4880,8 @@ bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs, if (Arg->isTypeDependent()) return false; - ExprResult Res = PerformImplicitConversion(Arg, Context.IntTy, AA_Passing); + ExprResult Res = PerformImplicitConversion(Arg, Context.IntTy, + AssignmentAction::Passing); if (Res.isInvalid()) return true; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d89a47f3e6226a..3044f1218f5b23 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -10871,7 +10871,8 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) { ExprResult This = ActOnCXXThis(OperatorDelete->getParamDecl(0)->getLocation()); assert(!This.isInvalid() && "couldn't form 'this' expr in dtor?"); - This = PerformImplicitConversion(This.get(), ParamType, AA_Passing); + This = PerformImplicitConversion(This.get(), ParamType, + AssignmentAction::Passing); if (This.isInvalid()) { // FIXME: Register this as a context note so that it comes out // in the right order. 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index de316f30e9523d..dcb08790911e74 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9586,7 +9586,7 @@ Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS, QualType RHSType = RHS.get()->getType(); if (Diagnose) { RHS = PerformImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(), - AA_Assigning); + AssignmentAction::Assigning); } else { ImplicitConversionSequence ICS = TryImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(), @@ -9598,7 +9598,7 @@ Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS, if (ICS.isFailure()) return Incompatible; RHS = PerformImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(), - ICS, AA_Assigning); + ICS, AssignmentAction::Assigning); } if (RHS.isInvalid()) return Incompatible; @@ -13660,8 +13660,8 @@ QualType Sema::CheckAssignmentOperands(Expr *LHSExpr, ExprResult &RHS, ConvTy = CheckAssignmentConstraints(Loc, LHSType, RHSType); } - if (DiagnoseAssignmentResult(ConvTy, Loc, LHSType, RHSType, - RHS.get(), AA_Assigning)) + if (DiagnoseAssignmentResult(ConvTy, Loc, LHSType, RHSType, RHS.get(), + AssignmentAction::Assigning)) return QualType(); CheckForNullPointerDereference(*this, LHSExpr); @@ -16669,7 +16669,7 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, MayHaveConvFixit = true; break; case IncompatiblePointer: - if (Action == AA_Passing_CFAudited) { + if (Action == AssignmentAction::Passing_CFAudited) { DiagKind = diag::err_arc_typecheck_convert_incompatible_pointer; } else if (getLangOpts().CPlusPlus) { DiagKind = diag::err_typecheck_convert_incompatible_pointer; @@ -16823,19 +16823,19 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, QualType FirstType, SecondType; switch (Action) { - case AA_Assigning: - case AA_Initializing: + case AssignmentAction::Assigning: + case AssignmentAction::Initializing: // The destination type comes 
first. FirstType = DstType; SecondType = SrcType; break; - case AA_Returning: - case AA_Passing: - case AA_Passing_CFAudited: - case AA_Converting: - case AA_Sending: - case AA_Casting: + case AssignmentAction::Returning: + case AssignmentAction::Passing: + case AssignmentAction::Passing_CFAudited: + case AssignmentAction::Converting: + case AssignmentAction::Sending: + case AssignmentAction::Casting: // The source type comes first. FirstType = SrcType; SecondType = DstType; @@ -16844,8 +16844,8 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, PartialDiagnostic FDiag = PDiag(DiagKind); AssignmentAction ActionForDiag = Action; - if (Action == AA_Passing_CFAudited) - ActionForDiag = AA_Passing; + if (Action == AssignmentAction::Passing_CFAudited) + ActionForDiag = AssignmentAction::Passing; FDiag << FirstType << SecondType << ActionForDiag << SrcExpr->getSourceRange(); @@ -16885,7 +16885,7 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, if (CheckInferredResultType) ObjC().EmitRelatedResultTypeNote(SrcExpr); - if (Action == AA_Returning && ConvTy == IncompatiblePointer) + if (Action == AssignmentAction::Returning && ConvTy == IncompatiblePointer) ObjC().EmitRelatedResultTypeNoteForReturn(DstType); if (Complained) diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d8719ab26cc83f..b7531581d37ff0 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -2199,8 +2199,8 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, if (getLangOpts().CPlusPlus14) { assert(Context.getTargetInfo().getIntWidth() && "Builtin type of size 0?"); - ConvertedSize = PerformImplicitConversion(*ArraySize, Context.getSizeType(), - AA_Converting); + ConvertedSize = PerformImplicitConversion( + *ArraySize, Context.getSizeType(), AssignmentAction::Converting); if (!ConvertedSize.isInvalid() && (*ArraySize)->getType()->getAs()) @@ -3851,7 +3851,8 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, 
bool UseGlobal, Context.getQualifiedType(Pointee.getUnqualifiedType(), Qs)); Ex = ImpCastExprToType(Ex.get(), Unqual, CK_NoOp); } - Ex = PerformImplicitConversion(Ex.get(), ParamType, AA_Passing); + Ex = PerformImplicitConversion(Ex.get(), ParamType, + AssignmentAction::Passing); if (Ex.isInvalid()) return ExprError(); } @@ -4256,10 +4257,9 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, } // Watch out for ellipsis conversion. if (!ICS.UserDefined.EllipsisConversion) { - ExprResult Res = - PerformImplicitConversion(From, BeforeToType, - ICS.UserDefined.Before, AA_Converting, - CCK); + ExprResult Res = PerformImplicitConversion( + From, BeforeToType, ICS.UserDefined.Before, + AssignmentAction::Converting, CCK); if (Res.isInvalid()) return ExprError(); From = Res.get(); @@ -4282,7 +4282,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, return From; return PerformImplicitConversion(From, ToType, ICS.UserDefined.After, - AA_Converting, CCK); + AssignmentAction::Converting, CCK); } case ImplicitConversionSequence::AmbiguousConversion: @@ -4451,19 +4451,19 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, // target entity shall allow at least the exceptions allowed by the // source value in the assignment or initialization. switch (Action) { - case AA_Assigning: - case AA_Initializing: + case AssignmentAction::Assigning: + case AssignmentAction::Initializing: // Note, function argument passing and returning are initialization. 
- case AA_Passing: - case AA_Returning: - case AA_Sending: - case AA_Passing_CFAudited: + case AssignmentAction::Passing: + case AssignmentAction::Returning: + case AssignmentAction::Sending: + case AssignmentAction::Passing_CFAudited: if (CheckExceptionSpecCompatibility(From, ToType)) return ExprError(); break; - case AA_Casting: - case AA_Converting: + case AssignmentAction::Casting: + case AssignmentAction::Converting: // Casts and implicit conversions are not initialization, so are not // checked for exception specification mismatches. break; @@ -4577,9 +4577,10 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, case ICK_Writeback_Conversion: case ICK_Pointer_Conversion: { - if (SCS.IncompatibleObjC && Action != AA_Casting) { + if (SCS.IncompatibleObjC && Action != AssignmentAction::Casting) { // Diagnose incompatible Objective-C conversions - if (Action == AA_Initializing || Action == AA_Assigning) + if (Action == AssignmentAction::Initializing || + Action == AssignmentAction::Assigning) Diag(From->getBeginLoc(), diag::ext_typecheck_convert_incompatible_pointer) << ToType << From->getType() << Action << From->getSourceRange() @@ -4596,12 +4597,12 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, } else if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() && !ObjC().CheckObjCARCUnavailableWeakConversion(ToType, From->getType())) { - if (Action == AA_Initializing) + if (Action == AssignmentAction::Initializing) Diag(From->getBeginLoc(), diag::err_arc_weak_unavailable_assign); else Diag(From->getBeginLoc(), diag::err_arc_convesion_of_weak_unavailable) - << (Action == AA_Casting) << From->getType() << ToType - << From->getSourceRange(); + << (Action == AssignmentAction::Casting) << From->getType() + << ToType << From->getSourceRange(); } // Defer address space conversion to the third conversion. @@ -6666,14 +6667,14 @@ static bool FindConditionalOverload(Sema &Self, ExprResult &LHS, ExprResult &RHS // We found a match. 
Perform the conversions on the arguments and move on. ExprResult LHSRes = Self.PerformImplicitConversion( LHS.get(), Best->BuiltinParamTypes[0], Best->Conversions[0], - Sema::AA_Converting); + AssignmentAction::Converting); if (LHSRes.isInvalid()) break; LHS = LHSRes; ExprResult RHSRes = Self.PerformImplicitConversion( RHS.get(), Best->BuiltinParamTypes[1], Best->Conversions[1], - Sema::AA_Converting); + AssignmentAction::Converting); if (RHSRes.isInvalid()) break; RHS = RHSRes; diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 5a19a3505454ca..7dc17187524621 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6799,43 +6799,44 @@ InitializationSequence::~InitializationSequence() { //===----------------------------------------------------------------------===// // Perform initialization //===----------------------------------------------------------------------===// -static Sema::AssignmentAction -getAssignmentAction(const InitializedEntity &Entity, bool Diagnose = false) { +static AssignmentAction getAssignmentAction(const InitializedEntity &Entity, + bool Diagnose = false) { switch(Entity.getKind()) { case InitializedEntity::EK_Variable: case InitializedEntity::EK_New: case InitializedEntity::EK_Exception: case InitializedEntity::EK_Base: case InitializedEntity::EK_Delegating: - return Sema::AA_Initializing; + return AssignmentAction::Initializing; case InitializedEntity::EK_Parameter: if (Entity.getDecl() && isa(Entity.getDecl()->getDeclContext())) - return Sema::AA_Sending; + return AssignmentAction::Sending; - return Sema::AA_Passing; + return AssignmentAction::Passing; case InitializedEntity::EK_Parameter_CF_Audited: if (Entity.getDecl() && isa(Entity.getDecl()->getDeclContext())) - return Sema::AA_Sending; + return AssignmentAction::Sending; - return !Diagnose ? Sema::AA_Passing : Sema::AA_Passing_CFAudited; + return !Diagnose ? 
AssignmentAction::Passing + : AssignmentAction::Passing_CFAudited; case InitializedEntity::EK_Result: case InitializedEntity::EK_StmtExprResult: // FIXME: Not quite right. - return Sema::AA_Returning; + return AssignmentAction::Returning; case InitializedEntity::EK_Temporary: case InitializedEntity::EK_RelatedResult: // FIXME: Can we tell apart casting vs. converting? - return Sema::AA_Casting; + return AssignmentAction::Casting; case InitializedEntity::EK_TemplateParameter: // This is really initialization, but refer to it as conversion for // consistency with CheckConvertedConstantExpression. - return Sema::AA_Converting; + return AssignmentAction::Converting; case InitializedEntity::EK_Member: case InitializedEntity::EK_ParenAggInitMember: @@ -6847,7 +6848,7 @@ getAssignmentAction(const InitializedEntity &Entity, bool Diagnose = false) { case InitializedEntity::EK_LambdaToBlockConversionBlockElement: case InitializedEntity::EK_LambdaCapture: case InitializedEntity::EK_CompoundLiteralInit: - return Sema::AA_Initializing; + return AssignmentAction::Initializing; } llvm_unreachable("Invalid EntityKind!"); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 74c646f64b42f2..23c4903ec15855 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -7395,7 +7395,8 @@ SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG, return std::nullopt; } VariantRefCast = SemaRef.PerformImplicitConversion( - VariantRef, FnPtrType.getUnqualifiedType(), Sema::AA_Converting); + VariantRef, FnPtrType.getUnqualifiedType(), + AssignmentAction::Converting); if (!VariantRefCast.isUsable()) return std::nullopt; } @@ -8415,9 +8416,10 @@ tryBuildCapture(Sema &SemaRef, Expr *Capture, if (SemaRef.CurContext->isDependentContext() || Capture->containsErrors()) return Capture; if (Capture->isEvaluatable(SemaRef.Context, Expr::SE_AllowSideEffects)) - return SemaRef.PerformImplicitConversion( - Capture->IgnoreImpCasts(), 
Capture->getType(), Sema::AA_Converting, - /*AllowExplicit=*/true); + return SemaRef.PerformImplicitConversion(Capture->IgnoreImpCasts(), + Capture->getType(), + AssignmentAction::Converting, + /*AllowExplicit=*/true); auto I = Captures.find(Capture); if (I != Captures.end()) return buildCapture(SemaRef, Capture, I->second, Name); @@ -8517,7 +8519,7 @@ calculateNumIters(Sema &SemaRef, Scope *S, SourceLocation DefaultLoc, SemaRef .PerformImplicitConversion( SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Upper).get(), - CastType, Sema::AA_Converting) + CastType, AssignmentAction::Converting) .get(); Lower = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Lower).get(); NewStep = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, NewStep.get()); @@ -8801,8 +8803,9 @@ Expr *OpenMPIterationSpaceChecker::buildNumIterations( : Type->hasSignedIntegerRepresentation(); Type = C.getIntTypeForBitwidth(NewSize, IsSigned); if (!SemaRef.Context.hasSameType(Diff.get()->getType(), Type)) { - Diff = SemaRef.PerformImplicitConversion( - Diff.get(), Type, Sema::AA_Converting, /*AllowExplicit=*/true); + Diff = SemaRef.PerformImplicitConversion(Diff.get(), Type, + AssignmentAction::Converting, + /*AllowExplicit=*/true); if (!Diff.isUsable()) return nullptr; } @@ -8820,7 +8823,8 @@ Expr *OpenMPIterationSpaceChecker::buildNumIterations( C.getTypeSize(Type) < NewSize); if (!SemaRef.Context.hasSameType(Diff.get()->getType(), NewType)) { Diff = SemaRef.PerformImplicitConversion(Diff.get(), NewType, - Sema::AA_Converting, true); + AssignmentAction::Converting, + /*AllowExplicit=*/true); if (!Diff.isUsable()) return nullptr; } @@ -8892,7 +8896,7 @@ std::pair OpenMPIterationSpaceChecker::buildMinMaxValues( SemaRef.Context.getUnsignedPointerDiffType())) { Diff = SemaRef.PerformImplicitConversion( Diff.get(), SemaRef.Context.getUnsignedPointerDiffType(), - Sema::AA_Converting, /*AllowExplicit=*/true); + AssignmentAction::Converting, /*AllowExplicit=*/true); } if (!Diff.isUsable()) return 
std::make_pair(nullptr, nullptr); @@ -8920,7 +8924,7 @@ std::pair OpenMPIterationSpaceChecker::buildMinMaxValues( // Convert to the original type. if (SemaRef.Context.hasSameType(Diff.get()->getType(), VarType)) Diff = SemaRef.PerformImplicitConversion(Diff.get(), VarType, - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true); if (!Diff.isUsable()) return std::make_pair(nullptr, nullptr); @@ -8955,7 +8959,7 @@ Expr *OpenMPIterationSpaceChecker::buildPreCond( return SemaRef .PerformImplicitConversion( SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get(), - SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting, + SemaRef.Context.BoolTy, /*Action=*/AssignmentAction::Casting, /*AllowExplicit=*/true) .get(); @@ -8976,7 +8980,8 @@ Expr *OpenMPIterationSpaceChecker::buildPreCond( if (!SemaRef.Context.hasSameUnqualifiedType(CondExpr.get()->getType(), SemaRef.Context.BoolTy)) CondExpr = SemaRef.PerformImplicitConversion( - CondExpr.get(), SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting, + CondExpr.get(), SemaRef.Context.BoolTy, + /*Action=*/AssignmentAction::Casting, /*AllowExplicit=*/true); } @@ -9393,7 +9398,7 @@ buildCounterInit(Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef, if (!SemaRef.Context.hasSameType(NewStart.get()->getType(), VarRef.get()->getType())) { NewStart = SemaRef.PerformImplicitConversion( - NewStart.get(), VarRef.get()->getType(), Sema::AA_Converting, + NewStart.get(), VarRef.get()->getType(), AssignmentAction::Converting, /*AllowExplicit=*/true); if (!NewStart.isUsable()) return ExprError(); @@ -9469,7 +9474,8 @@ static ExprResult buildCounterUpdate( if (!SemaRef.Context.hasSameType(Update.get()->getType(), VarRef.get()->getType())) { Update = SemaRef.PerformImplicitConversion( - Update.get(), VarRef.get()->getType(), Sema::AA_Converting, true); + Update.get(), VarRef.get()->getType(), AssignmentAction::Converting, + /*AllowExplicit=*/true); if (!Update.isUsable()) return ExprError(); } @@ -9491,8 +9497,8 
@@ static ExprResult widenIterationCount(unsigned Bits, Expr *E, Sema &SemaRef) { return ExprResult(E); // OK to convert to signed, because new type has more bits than old. QualType NewType = C.getIntTypeForBitwidth(Bits, /*Signed=*/true); - return SemaRef.PerformImplicitConversion(E, NewType, Sema::AA_Converting, - true); + return SemaRef.PerformImplicitConversion( + E, NewType, AssignmentAction::Converting, /*AllowExplicit=*/true); } /// Check if the given expression \a E is a constant integer that fits @@ -9752,19 +9758,19 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, // true). auto PreCond = ExprResult(IterSpaces[0].PreCond); Expr *N0 = IterSpaces[0].NumIterations; - ExprResult LastIteration32 = - widenIterationCount(/*Bits=*/32, - SemaRef - .PerformImplicitConversion( - N0->IgnoreImpCasts(), N0->getType(), - Sema::AA_Converting, /*AllowExplicit=*/true) - .get(), - SemaRef); + ExprResult LastIteration32 = widenIterationCount( + /*Bits=*/32, + SemaRef + .PerformImplicitConversion(N0->IgnoreImpCasts(), N0->getType(), + AssignmentAction::Converting, + /*AllowExplicit=*/true) + .get(), + SemaRef); ExprResult LastIteration64 = widenIterationCount( /*Bits=*/64, SemaRef .PerformImplicitConversion(N0->IgnoreImpCasts(), N0->getType(), - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true) .get(), SemaRef); @@ -9790,7 +9796,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, CurScope, Loc, BO_Mul, LastIteration32.get(), SemaRef .PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(), - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true) .get()); if (LastIteration64.isUsable()) @@ -9798,7 +9804,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, CurScope, Loc, BO_Mul, LastIteration64.get(), SemaRef .PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(), - Sema::AA_Converting, + AssignmentAction::Converting, /*AllowExplicit=*/true) 
.get()); } @@ -11538,7 +11544,7 @@ bool OpenMPAtomicUpdateChecker::checkStatement(Stmt *S, unsigned DiagId, if (Update.isInvalid()) return true; Update = SemaRef.PerformImplicitConversion(Update.get(), X->getType(), - Sema::AA_Casting); + AssignmentAction::Casting); if (Update.isInvalid()) return true; UpdateExpr = Update.get(); @@ -15655,7 +15661,7 @@ static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc, break; } Res = S.PerformImplicitConversion(Res.get(), AllocatorHandleEnumTy, - Sema::AA_Initializing, + AssignmentAction::Initializing, /*AllowExplicit=*/true); if (!Res.isUsable()) { ErrorFound = true; @@ -15686,7 +15692,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocatorClause(Expr *A, return nullptr; Allocator = SemaRef.PerformImplicitConversion( Allocator.get(), DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, + AssignmentAction::Initializing, /*AllowExplicit=*/true); if (Allocator.isInvalid()) return nullptr; @@ -23096,7 +23102,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( return nullptr; AllocatorRes = SemaRef.PerformImplicitConversion( AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, + AssignmentAction::Initializing, /*AllowExplicit=*/true); if (AllocatorRes.isInvalid()) return nullptr; @@ -23939,14 +23945,14 @@ ExprResult SemaOpenMP::ActOnOMPIteratorExpr(Scope *S, Expr *Begin = D.Range.Begin; if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) { - ExprResult BeginRes = - SemaRef.PerformImplicitConversion(Begin, DeclTy, Sema::AA_Converting); + ExprResult BeginRes = SemaRef.PerformImplicitConversion( + Begin, DeclTy, AssignmentAction::Converting); Begin = BeginRes.get(); } Expr *End = D.Range.End; if (!IsDeclTyDependent && End && !End->isTypeDependent()) { - ExprResult EndRes = - SemaRef.PerformImplicitConversion(End, DeclTy, Sema::AA_Converting); + ExprResult EndRes = SemaRef.PerformImplicitConversion( + End, DeclTy, AssignmentAction::Converting); End = EndRes.get(); } Expr *Step = 
D.Range.Step; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1ce0fa091938d7..a3c13e21c709cb 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1811,9 +1811,9 @@ ExprResult Sema::PerformImplicitConversion(Expr *From, QualType ToType, return ExprError(); // Objective-C ARC: Determine whether we will allow the writeback conversion. - bool AllowObjCWritebackConversion - = getLangOpts().ObjCAutoRefCount && - (Action == AA_Passing || Action == AA_Sending); + bool AllowObjCWritebackConversion = + getLangOpts().ObjCAutoRefCount && (Action == AssignmentAction::Passing || + Action == AssignmentAction::Sending); if (getLangOpts().ObjC) ObjC().CheckObjCBridgeRelatedConversions(From->getBeginLoc(), ToType, From->getType(), From); @@ -5983,7 +5983,8 @@ ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) { ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From); if (!ICS.isBad()) - return PerformImplicitConversion(From, Context.BoolTy, ICS, AA_Converting); + return PerformImplicitConversion(From, Context.BoolTy, ICS, + AssignmentAction::Converting); if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy)) return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition) @@ -6149,7 +6150,8 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From, T, cast(Dest)), SourceLocation(), From); } else { - Result = S.PerformImplicitConversion(From, T, ICS, Sema::AA_Converting); + Result = + S.PerformImplicitConversion(From, T, ICS, AssignmentAction::Converting); } if (Result.isInvalid()) return Result; @@ -6370,7 +6372,8 @@ ExprResult Sema::PerformContextuallyConvertToObjCPointer(Expr *From) { ImplicitConversionSequence ICS = TryContextuallyConvertToObjCPointer(*this, From); if (!ICS.isBad()) - return PerformImplicitConversion(From, Ty, ICS, AA_Converting); + return PerformImplicitConversion(From, Ty, ICS, + AssignmentAction::Converting); return ExprResult(); 
} @@ -14363,7 +14366,8 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, // break out so that we will build the appropriate built-in // operator node. ExprResult InputRes = PerformImplicitConversion( - Input, Best->BuiltinParamTypes[0], Best->Conversions[0], AA_Passing, + Input, Best->BuiltinParamTypes[0], Best->Conversions[0], + AssignmentAction::Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (InputRes.isInvalid()) return ExprError(); @@ -14825,14 +14829,16 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // operator node. ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); @@ -15203,14 +15209,16 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, // operator node. 
ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); + AssignmentAction::Passing, + CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); diff --git a/clang/lib/Sema/SemaPseudoObject.cpp b/clang/lib/Sema/SemaPseudoObject.cpp index fdb584ceb81059..30ed47e6e56ec9 100644 --- a/clang/lib/Sema/SemaPseudoObject.cpp +++ b/clang/lib/Sema/SemaPseudoObject.cpp @@ -787,7 +787,7 @@ ExprResult ObjCPropertyOpBuilder::buildSet(Expr *op, SourceLocation opcLoc, if (opResult.isInvalid() || S.DiagnoseAssignmentResult(assignResult, opcLoc, paramType, op->getType(), opResult.get(), - Sema::AA_Assigning)) + AssignmentAction::Assigning)) return ExprError(); op = opResult.get(); diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index ba681671eb3290..9664287b9a3fe9 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3151,7 +3151,8 @@ Sema::ActOnIndirectGotoStmt(SourceLocation GotoLoc, SourceLocation StarLoc, if (ExprRes.isInvalid()) return StmtError(); E = ExprRes.get(); - if (DiagnoseAssignmentResult(ConvTy, StarLoc, DestTy, ETy, E, AA_Passing)) + if (DiagnoseAssignmentResult(ConvTy, StarLoc, DestTy, ETy, E, + AssignmentAction::Passing)) return StmtError(); } From a0441ced7a770036e00610989e2fabba5caeb31b Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 29 Aug 2024 16:01:57 -0300 Subject: [PATCH 40/72] [NFC] whitespace cleanup on clang/test/SemaTemplate/temp_arg_nontype.cpp --- clang/test/SemaTemplate/temp_arg_nontype.cpp | 20 
++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/clang/test/SemaTemplate/temp_arg_nontype.cpp b/clang/test/SemaTemplate/temp_arg_nontype.cpp index da42f85fb910c8..f360aa14950edd 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype.cpp @@ -128,28 +128,28 @@ namespace ns { struct Foo { static const bool value = true; }; - + template struct Bar {}; - + const bool value = false; - + Bar::value)> x; } // PR5349 namespace ns { enum E { k }; - + template struct Baz {}; - + Baz f1; // This works. Baz f2; // This too. Baz(0)> f3; // And this. - + Baz b1; // This doesn't work. - Baz(0)> b2; // This neither. + Baz(0)> b2; // This neither. } // PR5597 @@ -193,7 +193,7 @@ namespace EntityReferenced { template struct Y { - static void f(T x) { + static void f(T x) { x = 1; // expected-error{{incompatible integer to pointer conversion assigning to 'int *' from 'int'}} } }; @@ -208,7 +208,7 @@ namespace PR6964 { // expected-note {{template parameter is declared here}} struct as_nview { }; - template + template struct as_nview // expected-note{{while checking a default template argument used here}} { }; } @@ -235,7 +235,7 @@ namespace test8 { char y; double z; }; - + template struct B { C* p; B() : p(cp) {} From a87105121dd300752c19024ebaf93319c2781a8b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 29 Aug 2024 14:18:37 -0500 Subject: [PATCH 41/72] [libc] Implement locale variants for 'stdlib.h' functions (#105718) Summary: This provides the `_l` variants for the `stdlib.h` functions. These are just copies of the same entrypoint and don't do anything with the locale information. 
--- libc/config/gpu/entrypoints.txt | 7 ++ libc/config/linux/x86_64/entrypoints.txt | 9 +++ libc/include/llvm-libc-macros/stdlib-macros.h | 5 ++ libc/include/stdlib.h.def | 1 + libc/newhdrgen/yaml/stdlib.yaml | 60 +++++++++++++++ libc/spec/stdc.td | 8 ++ libc/src/stdlib/CMakeLists.txt | 77 +++++++++++++++++++ libc/src/stdlib/strtod_l.cpp | 30 ++++++++ libc/src/stdlib/strtod_l.h | 22 ++++++ libc/src/stdlib/strtof_l.cpp | 30 ++++++++ libc/src/stdlib/strtof_l.h | 22 ++++++ libc/src/stdlib/strtol_l.cpp | 30 ++++++++ libc/src/stdlib/strtol_l.h | 22 ++++++ libc/src/stdlib/strtold_l.cpp | 30 ++++++++ libc/src/stdlib/strtold_l.h | 22 ++++++ libc/src/stdlib/strtoll_l.cpp | 30 ++++++++ libc/src/stdlib/strtoll_l.h | 22 ++++++ libc/src/stdlib/strtoul_l.cpp | 30 ++++++++ libc/src/stdlib/strtoul_l.h | 22 ++++++ libc/src/stdlib/strtoull_l.cpp | 30 ++++++++ libc/src/stdlib/strtoull_l.h | 23 ++++++ 21 files changed, 532 insertions(+) create mode 100644 libc/src/stdlib/strtod_l.cpp create mode 100644 libc/src/stdlib/strtod_l.h create mode 100644 libc/src/stdlib/strtof_l.cpp create mode 100644 libc/src/stdlib/strtof_l.h create mode 100644 libc/src/stdlib/strtol_l.cpp create mode 100644 libc/src/stdlib/strtol_l.h create mode 100644 libc/src/stdlib/strtold_l.cpp create mode 100644 libc/src/stdlib/strtold_l.h create mode 100644 libc/src/stdlib/strtoll_l.cpp create mode 100644 libc/src/stdlib/strtoll_l.h create mode 100644 libc/src/stdlib/strtoul_l.cpp create mode 100644 libc/src/stdlib/strtoul_l.h create mode 100644 libc/src/stdlib/strtoull_l.cpp create mode 100644 libc/src/stdlib/strtoull_l.h diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index db7cd24dadb7fc..d8f78f0d174534 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -173,12 +173,19 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.rand libc.src.stdlib.srand libc.src.stdlib.strtod + libc.src.stdlib.strtod_l libc.src.stdlib.strtof + libc.src.stdlib.strtof_l 
libc.src.stdlib.strtol + libc.src.stdlib.strtol_l libc.src.stdlib.strtold + libc.src.stdlib.strtold_l libc.src.stdlib.strtoll + libc.src.stdlib.strtoll_l libc.src.stdlib.strtoul + libc.src.stdlib.strtoul_l libc.src.stdlib.strtoull + libc.src.stdlib.strtoull_l libc.src.stdlib.at_quick_exit libc.src.stdlib.quick_exit libc.src.stdlib.getenv diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 141dc70463d64a..0aa38c7afc76f4 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -800,6 +800,15 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.ctype.tolower_l libc.src.ctype.toupper_l + # stdlib.h entrypoints + libc.src.stdlib.strtod_l + libc.src.stdlib.strtof_l + libc.src.stdlib.strtol_l + libc.src.stdlib.strtold_l + libc.src.stdlib.strtoll_l + libc.src.stdlib.strtoul_l + libc.src.stdlib.strtoull_l + # assert.h entrypoints libc.src.assert.__assert_fail diff --git a/libc/include/llvm-libc-macros/stdlib-macros.h b/libc/include/llvm-libc-macros/stdlib-macros.h index 5fcbfef97b3285..2565c76be3c55c 100644 --- a/libc/include/llvm-libc-macros/stdlib-macros.h +++ b/libc/include/llvm-libc-macros/stdlib-macros.h @@ -17,6 +17,11 @@ #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 +#ifndef MB_CUR_MAX +// We only support the "C" locale right now, so this is a constant byte. 
+#define MB_CUR_MAX 1 +#endif // MB_CUR_MAX + #define RAND_MAX 2147483647 #endif // LLVM_LIBC_MACROS_STDLIB_MACROS_H diff --git a/libc/include/stdlib.h.def b/libc/include/stdlib.h.def index d523f7a53024aa..01b0e1a2395a29 100644 --- a/libc/include/stdlib.h.def +++ b/libc/include/stdlib.h.def @@ -10,6 +10,7 @@ #define LLVM_LIBC_STDLIB_H #include "__llvm-libc-common.h" +#include "llvm-libc-types/locale_t.h" #include "llvm-libc-macros/stdlib-macros.h" %%public_api() diff --git a/libc/newhdrgen/yaml/stdlib.yaml b/libc/newhdrgen/yaml/stdlib.yaml index 081da5391c3a52..5da49b8a89101c 100644 --- a/libc/newhdrgen/yaml/stdlib.yaml +++ b/libc/newhdrgen/yaml/stdlib.yaml @@ -273,3 +273,63 @@ functions: - type: const char *__restrict - type: char **__restrict - type: int + - name: strtod_l + standards: + - stdc + return_type: double + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: locale_t + - name: strtof_l + standards: + - stdc + return_type: float + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: locale_t + - name: strtol_l + standards: + - stdc + return_type: long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t + - name: strtold_l + standards: + - stdc + return_type: long double + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: locale_t + - name: strtoll_l + standards: + - stdc + return_type: long long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t + - name: strtoul_l + standards: + - stdc + return_type: unsigned long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t + - name: strtoull_l + standards: + - stdc + return_type: unsigned long long + arguments: + - type: const char *__restrict + - type: char **__restrict + - type: int + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td 
index 026cc72b458a77..2c61cb9d952951 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -1308,6 +1308,14 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtof", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtod", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtold", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtol", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoll", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"malloc", RetValSpec, [ArgSpec]>, FunctionSpec<"calloc", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"realloc", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index ce12e66cf3e57f..7fc68cb35e8489 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -428,6 +428,83 @@ if(NOT LLVM_LIBC_FULL_BUILD) return() endif() +add_entrypoint_object( + strtof_l + SRCS + strtof_l.cpp + HDRS + strtof_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_float +) + +add_entrypoint_object( + strtod_l + SRCS + strtod_l.cpp + HDRS + strtod_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_float +) + +add_entrypoint_object( + strtold_l + SRCS + strtold_l.cpp + HDRS + strtold_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_float +) + +add_entrypoint_object( + strtol_l + SRCS + strtol_l.cpp + HDRS + strtol_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + +add_entrypoint_object( + strtoll_l + SRCS + strtoll_l.cpp + HDRS + strtoll_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + +add_entrypoint_object( + 
strtoul_l + SRCS + strtoul_l.cpp + HDRS + strtoul_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + +add_entrypoint_object( + strtoull_l + SRCS + strtoull_l.cpp + HDRS + strtoull_l.h + DEPENDS + libc.src.errno.errno + libc.src.__support.str_to_integer +) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) endif() diff --git a/libc/src/stdlib/strtod_l.cpp b/libc/src/stdlib/strtod_l.cpp new file mode 100644 index 00000000000000..247314398315b0 --- /dev/null +++ b/libc/src/stdlib/strtod_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtod_l ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtod_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, strtod_l, + (const char *__restrict str, char **__restrict str_end, + locale_t)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtod_l.h b/libc/src/stdlib/strtod_l.h new file mode 100644 index 00000000000000..06a8c893af2896 --- /dev/null +++ b/libc/src/stdlib/strtod_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtod_l ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOD_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOD_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double strtod_l(const char *__restrict str, char **__restrict str_end, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOD_L_H diff --git a/libc/src/stdlib/strtof_l.cpp b/libc/src/stdlib/strtof_l.cpp new file mode 100644 index 00000000000000..d54efa66e0846c --- /dev/null +++ b/libc/src/stdlib/strtof_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtof_l ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtof_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float, strtof_l, + (const char *__restrict str, char **__restrict str_end, + locale_t)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtof_l.h b/libc/src/stdlib/strtof_l.h new file mode 100644 index 00000000000000..de629e3f36d458 --- /dev/null +++ b/libc/src/stdlib/strtof_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtof_l ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOF_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOF_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +float strtof_l(const char *__restrict str, char **__restrict str_end, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOF_L_H diff --git a/libc/src/stdlib/strtol_l.cpp b/libc/src/stdlib/strtol_l.cpp new file mode 100644 index 00000000000000..f94aff1a0d7b2a --- /dev/null +++ b/libc/src/stdlib/strtol_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtol_l ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtol_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long, strtol_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtol_l.h b/libc/src/stdlib/strtol_l.h new file mode 100644 index 00000000000000..9f8c8553654d78 --- /dev/null +++ b/libc/src/stdlib/strtol_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtol_l ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long strtol_l(const char *__restrict str, char **__restrict str_end, int base, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOL_L_H diff --git a/libc/src/stdlib/strtold_l.cpp b/libc/src/stdlib/strtold_l.cpp new file mode 100644 index 00000000000000..d0c57f50246b5c --- /dev/null +++ b/libc/src/stdlib/strtold_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtold_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtold_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long double, strtold_l, + (const char *__restrict str, char **__restrict str_end, + locale_t)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtold_l.h b/libc/src/stdlib/strtold_l.h new file mode 100644 index 00000000000000..d694ce279b6e39 --- /dev/null +++ b/libc/src/stdlib/strtold_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtold_l ---------------------*- C++ -*-===// +// +// Part 
of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOLD_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOLD_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long double strtold_l(const char *__restrict str, char **__restrict str_end, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOLD_L_H diff --git a/libc/src/stdlib/strtoll_l.cpp b/libc/src/stdlib/strtoll_l.cpp new file mode 100644 index 00000000000000..e82971d59c48d3 --- /dev/null +++ b/libc/src/stdlib/strtoll_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtoll_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtoll_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long long, strtoll_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtoll_l.h b/libc/src/stdlib/strtoll_l.h new file mode 100644 index 00000000000000..461fedb3df485d --- /dev/null +++ b/libc/src/stdlib/strtoll_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtoll_l ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOLL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOLL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long long strtoll_l(const char *__restrict str, char **__restrict str_end, + int base, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOLL_L_H diff --git a/libc/src/stdlib/strtoul_l.cpp b/libc/src/stdlib/strtoul_l.cpp new file mode 100644 index 00000000000000..74fce00a0ac3c4 --- /dev/null +++ b/libc/src/stdlib/strtoul_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtoul_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtoul_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(unsigned long, strtoul_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtoul_l.h b/libc/src/stdlib/strtoul_l.h new file mode 100644 index 00000000000000..7c9f53a8acb31c --- /dev/null +++ b/libc/src/stdlib/strtoul_l.h @@ -0,0 +1,22 @@ +//===-- Implementation header for strtoul_l ---------------------*- C++ 
-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOUL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOUL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +unsigned long strtoul_l(const char *__restrict str, char **__restrict str_end, + int base, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOUL_L_H diff --git a/libc/src/stdlib/strtoull_l.cpp b/libc/src/stdlib/strtoull_l.cpp new file mode 100644 index 00000000000000..2ea8a43a40ef2a --- /dev/null +++ b/libc/src/stdlib/strtoull_l.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of strtoull_l --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/strtoull_l.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_integer.h" +#include "src/errno/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(unsigned long long, strtoull_l, + (const char *__restrict str, char **__restrict str_end, + int base, locale_t)) { + auto result = internal::strtointeger(str, base); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strtoull_l.h b/libc/src/stdlib/strtoull_l.h new file mode 100644 index 00000000000000..c40f83ed1ffff2 --- /dev/null +++ b/libc/src/stdlib/strtoull_l.h @@ -0,0 +1,23 @@ +//===-- Implementation header for strtoull_l --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_STRTOULL_L_H +#define LLVM_LIBC_SRC_STDLIB_STRTOULL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +unsigned long long strtoull_l(const char *__restrict str, + char **__restrict str_end, int base, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_STRTOULL_L_H From 5c019bdb7a008cf6465972d4affd8b2802465722 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 29 Aug 2024 14:20:15 -0500 Subject: [PATCH 42/72] [libc] Add support for 'string.h' locale variants (#105719) Summary: This adds the locale variants of the string functions. As previously, these do not use the locale information at all and simply copy the non-locale version which expects the "C" locale. --- libc/config/gpu/entrypoints.txt | 2 ++ libc/config/linux/x86_64/entrypoints.txt | 4 ++++ libc/include/string.h.def | 1 + libc/newhdrgen/yaml/string.yaml | 17 ++++++++++++++ libc/spec/stdc.td | 13 +++++++++++ libc/src/string/CMakeLists.txt | 19 ++++++++++++++++ libc/src/string/strcoll_l.cpp | 24 ++++++++++++++++++++ libc/src/string/strcoll_l.h | 21 ++++++++++++++++++ libc/src/string/strxfrm_l.cpp | 28 ++++++++++++++++++++++++ libc/src/string/strxfrm_l.h | 23 +++++++++++++++++++ 10 files changed, 152 insertions(+) create mode 100644 libc/src/string/strcoll_l.cpp create mode 100644 libc/src/string/strcoll_l.h create mode 100644 libc/src/string/strxfrm_l.cpp create mode 100644 libc/src/string/strxfrm_l.h diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index d8f78f0d174534..706f603b6ff56f 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -58,6 +58,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strchrnul libc.src.string.strcmp 
libc.src.string.strcoll + libc.src.string.strcoll_l libc.src.string.strcpy libc.src.string.strcspn libc.src.string.strdup @@ -79,6 +80,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strtok libc.src.string.strtok_r libc.src.string.strxfrm + libc.src.string.strxfrm_l # stdbit.h entrypoints libc.src.stdbit.stdc_bit_ceil_uc diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 0aa38c7afc76f4..3fd88fc0020e55 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -809,6 +809,10 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.strtoul_l libc.src.stdlib.strtoull_l + # string.h entrypoints + libc.src.string.strcoll_l + libc.src.string.strxfrm_l + # assert.h entrypoints libc.src.assert.__assert_fail diff --git a/libc/include/string.h.def b/libc/include/string.h.def index 1bd2687db2beac..e180f0d2561d3a 100644 --- a/libc/include/string.h.def +++ b/libc/include/string.h.def @@ -11,6 +11,7 @@ #include "__llvm-libc-common.h" +#include "llvm-libc-types/locale_t.h" #include "llvm-libc-macros/null-macro.h" %%public_api() diff --git a/libc/newhdrgen/yaml/string.yaml b/libc/newhdrgen/yaml/string.yaml index 1d6e64bfb9cf60..af1750e91243ea 100644 --- a/libc/newhdrgen/yaml/string.yaml +++ b/libc/newhdrgen/yaml/string.yaml @@ -144,6 +144,14 @@ functions: arguments: - type: const char * - type: const char * + - name: strcoll_l + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: const char * + - type: locale_t - name: strcpy standards: - stdc @@ -300,3 +308,12 @@ functions: - type: char *__restrict - type: const char *__restrict - type: size_t + - name: strxfrm_l + standards: + - stdc + return_type: size_t + arguments: + - type: char *__restrict + - type: const char *__restrict + - type: size_t + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 2c61cb9d952951..1742e1f7b0ef33 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -354,6 
+354,11 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec, ArgSpec] >, + FunctionSpec< + "strcoll_l", + RetValSpec, + [ArgSpec, ArgSpec, ArgSpec] + >, FunctionSpec< "strncmp", RetValSpec, @@ -366,6 +371,14 @@ def StdC : StandardSpec<"stdc"> { ArgSpec, ArgSpec] >, + FunctionSpec< + "strxfrm_l", + RetValSpec, + [ArgSpec, + ArgSpec, + ArgSpec, + ArgSpec] + >, FunctionSpec< "strchr", RetValSpec, diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 56588ffafb86f0..787188ab3beb91 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -200,6 +200,14 @@ add_entrypoint_object( strcoll.h ) +add_entrypoint_object( + strcoll_l + SRCS + strcoll_l.cpp + HDRS + strcoll_l.h +) + add_entrypoint_object( strcpy SRCS @@ -441,6 +449,17 @@ add_entrypoint_object( .memory_utils.inline_memcpy ) +add_entrypoint_object( + strxfrm_l + SRCS + strxfrm_l.cpp + HDRS + strxfrm_l.h + DEPENDS + .string_utils + .memory_utils.inline_memcpy +) + add_entrypoint_object( memset_explicit SRCS diff --git a/libc/src/string/strcoll_l.cpp b/libc/src/string/strcoll_l.cpp new file mode 100644 index 00000000000000..f664a3c7c03f37 --- /dev/null +++ b/libc/src/string/strcoll_l.cpp @@ -0,0 +1,24 @@ +//===-- Implementation of strcoll_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/strcoll_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Add support for locales. 
+LLVM_LIBC_FUNCTION(int, strcoll_l, + (const char *left, const char *right, locale_t)) { + for (; *left && *left == *right; ++left, ++right) + ; + return static_cast(*left) - static_cast(*right); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/strcoll_l.h b/libc/src/string/strcoll_l.h new file mode 100644 index 00000000000000..97230fb811236c --- /dev/null +++ b/libc/src/string/strcoll_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for strcoll_l ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_STRCOLL_L_H +#define LLVM_LIBC_SRC_STRING_STRCOLL_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int strcoll_l(const char *left, const char *right, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_STRCOLL_L_H diff --git a/libc/src/string/strxfrm_l.cpp b/libc/src/string/strxfrm_l.cpp new file mode 100644 index 00000000000000..ae758e1fcba6d8 --- /dev/null +++ b/libc/src/string/strxfrm_l.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of strxfrm_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/strxfrm_l.h" +#include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" +#include "src/string/string_utils.h" + +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Add support for locales. +LLVM_LIBC_FUNCTION(size_t, strxfrm_l, + (char *__restrict dest, const char *__restrict src, size_t n, + locale_t)) { + size_t len = internal::string_length(src); + if (n > len) + inline_memcpy(dest, src, len + 1); + return len; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/strxfrm_l.h b/libc/src/string/strxfrm_l.h new file mode 100644 index 00000000000000..af0f181601184b --- /dev/null +++ b/libc/src/string/strxfrm_l.h @@ -0,0 +1,23 @@ +//===-- Implementation header for strxfrm_l ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_STRXFRM_L_H +#define LLVM_LIBC_SRC_STRING_STRXFRM_L_H + +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" +#include // For size_t + +namespace LIBC_NAMESPACE_DECL { + +size_t strxfrm_l(char *__restrict dest, const char *__restrict src, size_t n, + locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_STRXFRM_L_H From ba5e8fcecea20da0a796b85e20d6292eb1447b6c Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Thu, 29 Aug 2024 15:21:06 -0400 Subject: [PATCH 43/72] [flang] Adjust execute_command_line intrinsic return values for AIX (NFC) (#106472) --- flang/unittests/Runtime/CommandTest.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp index 20bd7a5b5ff35a..b0c43ba01d8f33 100644 --- a/flang/unittests/Runtime/CommandTest.cpp +++ b/flang/unittests/Runtime/CommandTest.cpp @@ -348,10 +348,14 @@ TEST_F(ZeroArguments, ECLGeneralErrorCommandErrorSync) { RTNAME(ExecuteCommandLine) (*command.get(), wait, exitStat.get(), cmdStat.get(), cmdMsg.get()); -#ifdef _WIN32 +#if defined(_WIN32) CheckDescriptorEqInt(exitStat.get(), 1); CheckDescriptorEqInt(cmdStat.get(), 6); CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); +#elif defined(_AIX) + CheckDescriptorEqInt(exitStat.get(), 2); + CheckDescriptorEqInt(cmdStat.get(), 6); + CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); #else CheckDescriptorEqInt(exitStat.get(), 1); CheckDescriptorEqInt(cmdStat.get(), 3); From 74938ab84dbfdedc6af7a276ebd67201b5eb78e5 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Thu, 29 Aug 2024 15:21:25 -0400 Subject: [PATCH 44/72] [AMDGPU][True16][MC] add true16/fake16 flag to gfx12 dasm tests (#106469) add true16/fake16 
flag to gfx12 dasm tests including vop1, vop1_dpp, vop3_from_vop1 and vop3_from_vop1_dpp. This is a test only change. --- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 356 +++++++++++++----- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 104 ++++- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 328 ++++++++++------ .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 297 ++++++++++----- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 87 +++-- 5 files changed, 817 insertions(+), 355 deletions(-) diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index ac45962e1743e4..b4aff84eeb69a4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -43,48 +45,70 @@ # GFX12: v_bfrev_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x0d,0x30 -# GFX12: v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# 
GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] 
0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_ceil_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_ceil_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_ceil_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_ceil_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_ceil_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_ceil_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_ceil_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_ceil_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -1231,48 +1255,70 @@ # GFX12: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] 0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30 -# GFX12: v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_exp_f16_dpp v5, v1 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_exp_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_exp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_exp_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_exp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_exp_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb0,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb0,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_exp_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb0,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xb0,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_exp_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x4a,0x0a,0x7e,0x01,0x1b,0x00,0xff 
@@ -1315,48 +1361,70 @@ # GFX12: v_exp_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] 
0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_floor_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_floor_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_floor_f16_dpp v127, -|v127| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_floor_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_floor_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_floor_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] +0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_floor_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] +0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_floor_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -1651,48 +1719,70 @@ # GFX12: v_frexp_mant_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x80,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x80,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_log_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_log_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_log_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_log_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xae,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_log_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_log_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_log_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_log_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xae,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xae,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_log_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xae,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# 
COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xae,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_log_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -2029,48 +2119,70 @@ # GFX12: v_not_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x6e,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x6e,0xfe,0x7f,0xff,0x6f,0x0d,0x30 -# GFX12: v_rcp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_rcp_f16_dpp 
v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_rcp_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_rcp_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_rcp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_rcp_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_rcp_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_rcp_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xa8,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xa8,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_rcp_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xa8,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xa8,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_rcp_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -2239,48 +2351,70 @@ # GFX12: v_rndne_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x46,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x46,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_rsq_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_rsq_f16_dpp v5, 
v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_rsq_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: 
v_rsq_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_rsq_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_rsq_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xac,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_rsq_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_rsq_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_rsq_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_rsq_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xac,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xac,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_rsq_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 ; encoding: [0xfa,0xac,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xac,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_rsq_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -2449,48 +2583,70 @@ # GFX12: v_sin_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x6a,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x6a,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX12: v_sqrt_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_sqrt_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_sqrt_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_sqrt_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_sqrt_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_sqrt_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] 0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_sqrt_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xaa,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xaa,0x0a,0x7f,0x81,0x60,0x01,0x13 + +# GFX12-REAL16: v_sqrt_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xaa,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xfa,0xaa,0xfe,0x7f,0xff,0x6f,0x3d,0x30 + # GFX12: v_sqrt_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 957c425008c872..04650eaec1180d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck 
-check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -7,12 +9,22 @@ # GFX12: v_bfrev_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_ceil_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_ceil_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# 
GFX12-REAL16: v_ceil_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_ceil_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -187,24 +199,44 @@ # GFX12: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00] 0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00 -# GFX12: v_exp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_exp_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xb0,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_exp_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb0,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xb0,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_exp_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_exp_f32_dpp v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_floor_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_floor_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_floor_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_floor_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -247,12 +279,22 @@ # GFX12: v_frexp_mant_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_log_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_log_f16_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_log_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xae,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xae,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_log_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xae,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xae,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_log_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -301,12 +343,22 @@ # GFX12: v_not_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x6e,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x6e,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_rcp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_rcp_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xa8,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xa8,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_rcp_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xa8,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xa8,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_rcp_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -331,12 +383,22 @@ # GFX12: v_rndne_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x46,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x46,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_rsq_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_rsq_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xac,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xac,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_rsq_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xac,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xac,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_rsq_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -361,12 +423,22 @@ # GFX12: v_sin_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x6a,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x6a,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sqrt_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xea,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_sqrt_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xaa,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xe9,0xaa,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX12-REAL16: v_sqrt_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xaa,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX12-FAKE16: warning: invalid instruction encoding +0xea,0xaa,0xfe,0x7f,0xff,0x00,0x00,0x00 + # GFX12: v_sqrt_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 5c7cc3a8e223e6..c0aab0692bbb5b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00 @@ -45,49 +46,64 @@ # GFX12: v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_ceil_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_ceil_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 
0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_ceil_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x01,0x00,0x00] @@ -513,7 +529,7 @@ # GFX12: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_f16_i16_e64 v5, 0x3800 mul:2 +# GFX12: v_cvt_f16_i16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] 0x05,0x00,0xd1,0xd5,0xf0,0x00,0x00,0x08 # GFX12: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10] @@ -558,7 +574,7 @@ # GFX12: v_cvt_f16_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_f16_u16_e64 v5, 0x3800 mul:2 +# GFX12: v_cvt_f16_u16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] 0x05,0x00,0xd0,0xd5,0xf0,0x00,0x00,0x08 # GFX12: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] @@ -1260,7 +1276,7 @@ # GFX12: v_cvt_i32_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64 v5, 0x3800 +# GFX12: v_cvt_i32_i16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xea,0xd5,0xf0,0x00,0x00,0x00 # GFX12: v_cvt_i32_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xea,0xd5,0xfd,0x00,0x00,0x00] @@ -1611,7 +1627,7 @@ # GFX12: v_cvt_u32_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cvt_u32_u16_e64 v5, 0x3800 +# GFX12: v_cvt_u32_u16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xeb,0xd5,0xf0,0x00,0x00,0x00 # GFX12: v_cvt_u32_u16_e64 v5, src_scc ; encoding: [0x05,0x00,0xeb,0xd5,0xfd,0x00,0x00,0x00] @@ -1620,49 +1636,64 @@ # GFX12: v_cvt_u32_u16_e64 v255, 0xfe0b ; encoding: 
[0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] 
0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64 v5, 0.5 mul:2 ; 
encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_exp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_exp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_exp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x01,0x00,0x00] @@ -1710,49 +1741,64 @@ # GFX12: v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] 0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf -# GFX12: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] 
0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: 
v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_floor_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_floor_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: 
v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_floor_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x01,0x00,0x00] @@ -2214,49 +2260,64 @@ # GFX12: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf -# GFX12: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] 
0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, 
null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_log_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_log_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_log_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x01,0x00,0x00] @@ -2448,7 +2509,7 @@ # GFX12: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, 0x3800 +# 
GFX12: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00 # GFX12: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] @@ -2502,49 +2563,64 @@ # GFX12: v_not_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] 
0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, -1 ; 
encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_rcp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_rcp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x01,0x00,0x00] @@ -2799,49 +2875,64 @@ # GFX12: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf -# GFX12: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] 
0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_rsq_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: 
v_rsq_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_rsq_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x01,0x00,0x00] @@ -3060,49 +3151,64 @@ # GFX12: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] 0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf -# GFX12: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_sqrt_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_sqrt_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] 0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 
0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 # GFX12: v_sqrt_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index f9c768e3e02665..95cfad146fc163 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff @@ -42,46 +43,60 @@ # GFX12: v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_ceil_f16_e64_dpp 
v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1260,46 +1275,60 @@ # GFX12: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: 
v_exp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_exp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_exp_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1344,46 +1373,60 @@ # GFX12: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1680,46 +1723,60 @@ # GFX12: v_frexp_mant_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xc0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xc0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_log_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: 
v_log_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_log_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_log_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_log_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_log_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_log_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_log_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -1932,46 +1989,60 @@ # GFX12: v_not_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_rcp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: 
v_rcp_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_rcp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_rcp_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xaa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -2142,46 +2213,60 @@ # GFX12: v_rndne_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xa3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_rsq_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: 
v_rsq_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_rsq_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_rsq_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xae,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -2352,46 +2437,60 @@ # GFX12: v_sin_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xb5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sqrt_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 
mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 # GFX12: v_sqrt_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index eccd691855774e..a9474c7c4fe7f0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -1,4 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble 
-show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s # GFX12: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 @@ -6,16 +7,20 @@ # GFX12: v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_ceil_f16_e64_dpp v255, -|v255| clamp 
div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -306,16 +311,20 @@ # GFX12: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_exp_f16_e64_dpp 
v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_exp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_exp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd8,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_exp_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -330,16 +339,20 @@ # GFX12: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -414,16 +427,20 @@ # GFX12: v_frexp_mant_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xc0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xc0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_log_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_log_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_log_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_log_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_log_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_log_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_log_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd7,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_log_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -462,16 +479,20 @@ # GFX12: v_not_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x00,0xb7,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xb7,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_rcp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rcp_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_rcp_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_rcp_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xaa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -522,16 +543,20 @@ # GFX12: v_rndne_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xa3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_rsq_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_rsq_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_rsq_f16_e64_dpp v255, 
-|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_rsq_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd6,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_rsq_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xae,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -576,16 +601,20 @@ # GFX12: v_sin_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xb5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# 
GFX12-REAL16: v_sqrt_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sqrt_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0xff,0x81,0xd5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX12: v_sqrt_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 59762a0ecf64cbf6ac20c41ae75666cd87519f26 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 29 Aug 2024 12:08:33 -0700 Subject: [PATCH 45/72] [RISCV] Add coverage for <3 x float> reduction with neutral start We can do slightly better on the neutral value when we have nsz. 
--- .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 237 +++++++++++------- 1 file changed, 141 insertions(+), 96 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 566c9070eab512..5d5807cbadbad5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -489,6 +489,51 @@ define float @vreduce_ord_fadd_v7f32(ptr %x, float %s) { ret float %red } +define float @vreduce_fadd_v7f32_neutralstart(ptr %x) { +; CHECK-LABEL: vreduce_fadd_v7f32_neutralstart: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call reassoc float @llvm.vector.reduce.fadd.v7f32(float -0.0, <7 x float> %v) + ret float %red +} + +define float @vreduce_fadd_v7f32_neutralstart_nsz(ptr %x) { +; CHECK-LABEL: vreduce_fadd_v7f32_neutralstart_nsz: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredosum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call nsz float @llvm.vector.reduce.fadd.v7f32(float -0.0, <7 x float> %v) + ret float %red +} + +define float @vreduce_fadd_v7f32_neutralstart_fast(ptr %x) { +; CHECK-LABEL: vreduce_fadd_v7f32_neutralstart_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call fast float @llvm.vector.reduce.fadd.v7f32(float -0.0, <7 x 
float> %v) + ret float %red +} + declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) @@ -1683,12 +1728,12 @@ define float @vreduce_fminimum_v2f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB104_2 +; CHECK-NEXT: beqz a0, .LBB107_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB104_2: +; CHECK-NEXT: .LBB107_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1719,12 +1764,12 @@ define float @vreduce_fminimum_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB106_2 +; CHECK-NEXT: beqz a0, .LBB109_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB106_2: +; CHECK-NEXT: .LBB109_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1758,12 +1803,12 @@ define float @vreduce_fminimum_v7f32(ptr %x) { ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10, v0.t -; CHECK-NEXT: beqz a0, .LBB108_2 +; CHECK-NEXT: beqz a0, .LBB111_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: .LBB111_2: ; CHECK-NEXT: lui a0, 522240 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vfredmin.vs v8, v8, v10 @@ -1798,12 +1843,12 @@ define float @vreduce_fminimum_v8f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB110_2 +; CHECK-NEXT: beqz a0, .LBB113_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB110_2: +; CHECK-NEXT: .LBB113_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, 
v8 ; CHECK-NEXT: ret @@ -1834,12 +1879,12 @@ define float @vreduce_fminimum_v16f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB112_2 +; CHECK-NEXT: beqz a0, .LBB115_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB112_2: +; CHECK-NEXT: .LBB115_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1871,12 +1916,12 @@ define float @vreduce_fminimum_v32f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB114_2 +; CHECK-NEXT: beqz a0, .LBB117_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB114_2: +; CHECK-NEXT: .LBB117_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1926,15 +1971,15 @@ define float @vreduce_fminimum_v64f32(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB116_2 +; CHECK-NEXT: beqz a0, .LBB119_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB116_3 -; CHECK-NEXT: .LBB116_2: +; CHECK-NEXT: j .LBB119_3 +; CHECK-NEXT: .LBB119_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB116_3: +; CHECK-NEXT: .LBB119_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2048,15 +2093,15 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB118_2 +; CHECK-NEXT: beqz a0, .LBB121_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB118_3 -; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: j 
.LBB121_3 +; CHECK-NEXT: .LBB121_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB118_3: +; CHECK-NEXT: .LBB121_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2102,12 +2147,12 @@ define double @vreduce_fminimum_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB120_2 +; CHECK-NEXT: beqz a0, .LBB123_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI120_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI120_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI123_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI123_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB120_2: +; CHECK-NEXT: .LBB123_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2138,12 +2183,12 @@ define double @vreduce_fminimum_v4f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB122_2 +; CHECK-NEXT: beqz a0, .LBB125_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI122_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI122_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI125_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI125_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB122_2: +; CHECK-NEXT: .LBB125_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2174,12 +2219,12 @@ define double @vreduce_fminimum_v8f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB124_2 +; CHECK-NEXT: beqz a0, .LBB127_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI124_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI124_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI127_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI127_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB124_2: +; CHECK-NEXT: .LBB127_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2210,12 +2255,12 
@@ define double @vreduce_fminimum_v16f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB126_2 +; CHECK-NEXT: beqz a0, .LBB129_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI126_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI126_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI129_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI129_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB126_2: +; CHECK-NEXT: .LBB129_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2263,15 +2308,15 @@ define double @vreduce_fminimum_v32f64(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB128_2 +; CHECK-NEXT: beqz a0, .LBB131_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI128_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI128_0)(a0) -; CHECK-NEXT: j .LBB128_3 -; CHECK-NEXT: .LBB128_2: +; CHECK-NEXT: lui a0, %hi(.LCPI131_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI131_0)(a0) +; CHECK-NEXT: j .LBB131_3 +; CHECK-NEXT: .LBB131_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB128_3: +; CHECK-NEXT: .LBB131_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2383,15 +2428,15 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB130_2 +; CHECK-NEXT: beqz a0, .LBB133_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI130_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI130_0)(a0) -; CHECK-NEXT: j .LBB130_3 -; CHECK-NEXT: .LBB130_2: +; CHECK-NEXT: lui a0, %hi(.LCPI133_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI133_0)(a0) +; CHECK-NEXT: j .LBB133_3 +; CHECK-NEXT: .LBB133_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB130_3: +; CHECK-NEXT: .LBB133_3: ; CHECK-NEXT: csrr a0, vlenb ; 
CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2436,12 +2481,12 @@ define float @vreduce_fmaximum_v2f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB132_2 +; CHECK-NEXT: beqz a0, .LBB135_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB132_2: +; CHECK-NEXT: .LBB135_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2472,12 +2517,12 @@ define float @vreduce_fmaximum_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB134_2 +; CHECK-NEXT: beqz a0, .LBB137_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB134_2: +; CHECK-NEXT: .LBB137_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2511,12 +2556,12 @@ define float @vreduce_fmaximum_v7f32(ptr %x) { ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10, v0.t -; CHECK-NEXT: beqz a0, .LBB136_2 +; CHECK-NEXT: beqz a0, .LBB139_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB136_2: +; CHECK-NEXT: .LBB139_2: ; CHECK-NEXT: lui a0, 1046528 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vfredmax.vs v8, v8, v10 @@ -2551,12 +2596,12 @@ define float @vreduce_fmaximum_v8f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB138_2 +; CHECK-NEXT: beqz a0, .LBB141_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB138_2: +; CHECK-NEXT: .LBB141_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2587,12 +2632,12 @@ 
define float @vreduce_fmaximum_v16f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB140_2 +; CHECK-NEXT: beqz a0, .LBB143_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB140_2: +; CHECK-NEXT: .LBB143_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2624,12 +2669,12 @@ define float @vreduce_fmaximum_v32f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB142_2 +; CHECK-NEXT: beqz a0, .LBB145_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB142_2: +; CHECK-NEXT: .LBB145_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2679,15 +2724,15 @@ define float @vreduce_fmaximum_v64f32(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB144_2 +; CHECK-NEXT: beqz a0, .LBB147_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB144_3 -; CHECK-NEXT: .LBB144_2: +; CHECK-NEXT: j .LBB147_3 +; CHECK-NEXT: .LBB147_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB144_3: +; CHECK-NEXT: .LBB147_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2801,15 +2846,15 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB146_2 +; CHECK-NEXT: beqz a0, .LBB149_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB146_3 -; CHECK-NEXT: .LBB146_2: +; CHECK-NEXT: j .LBB149_3 +; CHECK-NEXT: .LBB149_2: ; 
CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB146_3: +; CHECK-NEXT: .LBB149_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2855,12 +2900,12 @@ define double @vreduce_fmaximum_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB148_2 +; CHECK-NEXT: beqz a0, .LBB151_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI148_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI148_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI151_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI151_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB148_2: +; CHECK-NEXT: .LBB151_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2891,12 +2936,12 @@ define double @vreduce_fmaximum_v4f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB150_2 +; CHECK-NEXT: beqz a0, .LBB153_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI150_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI150_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI153_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI153_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB150_2: +; CHECK-NEXT: .LBB153_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2927,12 +2972,12 @@ define double @vreduce_fmaximum_v8f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB152_2 +; CHECK-NEXT: beqz a0, .LBB155_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI152_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI152_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI155_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI155_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB152_2: +; CHECK-NEXT: .LBB155_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2963,12 +3008,12 @@ define double 
@vreduce_fmaximum_v16f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB154_2 +; CHECK-NEXT: beqz a0, .LBB157_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI154_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI154_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI157_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI157_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB154_2: +; CHECK-NEXT: .LBB157_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -3016,15 +3061,15 @@ define double @vreduce_fmaximum_v32f64(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB156_2 +; CHECK-NEXT: beqz a0, .LBB159_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI156_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI156_0)(a0) -; CHECK-NEXT: j .LBB156_3 -; CHECK-NEXT: .LBB156_2: +; CHECK-NEXT: lui a0, %hi(.LCPI159_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI159_0)(a0) +; CHECK-NEXT: j .LBB159_3 +; CHECK-NEXT: .LBB159_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB156_3: +; CHECK-NEXT: .LBB159_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -3136,15 +3181,15 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB158_2 +; CHECK-NEXT: beqz a0, .LBB161_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI158_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI158_0)(a0) -; CHECK-NEXT: j .LBB158_3 -; CHECK-NEXT: .LBB158_2: +; CHECK-NEXT: lui a0, %hi(.LCPI161_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI161_0)(a0) +; CHECK-NEXT: j .LBB161_3 +; CHECK-NEXT: .LBB161_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB158_3: +; CHECK-NEXT: .LBB161_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 
3 ; CHECK-NEXT: mv a1, a0 From d5c292d8ef590f64d26c16d12afebb6ad7f50373 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 12:35:50 -0700 Subject: [PATCH 46/72] [GISel][RISCV] Correctly handle scalable vector shuffles of pointer vectors in IRTranslator. (#106580) --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 11 ++++----- llvm/lib/CodeGen/MachineVerifier.cpp | 4 ++-- .../GlobalISel/irtranslator/shufflevector.ll | 23 +++++++++++++++++-- .../MachineVerifier/test_g_splat_vector.mir | 4 ++-- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 968d0a2a5c75e4..b290d7fb4ce4a1 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3183,15 +3183,14 @@ bool IRTranslator::translateExtractElement(const User &U, bool IRTranslator::translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder) { - // A ShuffleVector that has operates on scalable vectors is a splat vector - // where the value of the splat vector is the 0th element of the first - // operand, since the index mask operand is the zeroinitializer (undef and + // A ShuffleVector that operates on scalable vectors is a splat vector where + // the value of the splat vector is the 0th element of the first operand, + // since the index mask operand is the zeroinitializer (undef and // poison are treated as zeroinitializer here). 
if (U.getOperand(0)->getType()->isScalableTy()) { - Value *Op0 = U.getOperand(0); + Register Val = getOrCreateVReg(*U.getOperand(0)); auto SplatVal = MIRBuilder.buildExtractVectorElementConstant( - LLT::scalar(Op0->getType()->getScalarSizeInBits()), - getOrCreateVReg(*Op0), 0); + MRI->getType(Val).getElementType(), Val, 0); MIRBuilder.buildSplatVector(getOrCreateVReg(U), SplatVal); return true; } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 5e9bb4c27ffbdf..759201ed9dadc7 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1835,8 +1835,8 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } - if (!SrcTy.isScalar()) { - report("Source type must be a scalar", MI); + if (!SrcTy.isScalar() && !SrcTy.isPointer()) { + report("Source type must be a scalar or pointer", MI); break; } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll index 7ea67073bc28d2..89c7bfe81d5f98 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/shufflevector.ll @@ -1770,5 +1770,24 @@ define @shufflevector_nxv16i64_2( %a) { ret %b } - - +define @shufflevector_nxv1p0_0() { + ; RV32-LABEL: name: shufflevector_nxv1p0_0 + ; RV32: bb.1 (%ir-block.0): + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[EVEC:%[0-9]+]]:_(p0) = G_EXTRACT_VECTOR_ELT [[DEF]](), [[C]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[EVEC]](p0) + ; RV32-NEXT: $v8 = COPY [[SPLAT_VECTOR]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: shufflevector_nxv1p0_0 + ; RV64: bb.1 (%ir-block.0): + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV64-NEXT: 
[[EVEC:%[0-9]+]]:_(p0) = G_EXTRACT_VECTOR_ELT [[DEF]](), [[C]](s64) + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[EVEC]](p0) + ; RV64-NEXT: $v8 = COPY [[SPLAT_VECTOR]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %a = shufflevector poison, poison, zeroinitializer + ret %a +} diff --git a/llvm/test/MachineVerifier/test_g_splat_vector.mir b/llvm/test/MachineVerifier/test_g_splat_vector.mir index 00074349776fa7..a5bde496a3f22c 100644 --- a/llvm/test/MachineVerifier/test_g_splat_vector.mir +++ b/llvm/test/MachineVerifier/test_g_splat_vector.mir @@ -16,10 +16,10 @@ body: | ; CHECK: Destination type must be a scalable vector %4:_(<2 x s32>) = G_SPLAT_VECTOR %0 - ; CHECK: Source type must be a scalar + ; CHECK: Source type must be a scalar or pointer %5:_() = G_SPLAT_VECTOR %1 - ; CHECK: Source type must be a scalar + ; CHECK: Source type must be a scalar or pointer %6:_() = G_SPLAT_VECTOR %2 ; CHECK: Element type of the destination must be the same size or smaller than the source type From aeedab77b596f858b0c53923657fc8c190d48ea8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 29 Aug 2024 12:32:25 -0700 Subject: [PATCH 47/72] [SLP]Correctly decide if the non-power-of-2 number of stores can be vectorized. Need to consider the maximum type size in the graph before doing attempt for the vectorization of non-power-of-2 number of elements, which may be less than MinVF. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 5 +- .../Transforms/SLPVectorizer/X86/odd_store.ll | 62 ++++++++----------- 2 files changed, 28 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e77db3cbd81fe5..775fa9ba75cfb7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16476,8 +16476,9 @@ bool SLPVectorizerPass::vectorizeStores( // First try vectorizing with a non-power-of-2 VF. 
At the moment, only // consider cases where VF + 1 is a power-of-2, i.e. almost all vector // lanes are used. - unsigned CandVF = Operands.size(); - if (has_single_bit(CandVF + 1) && CandVF <= MaxRegVF) + unsigned CandVF = + std::clamp(Operands.size(), MaxVF, MaxRegVF); + if (has_single_bit(CandVF + 1)) NonPowerOf2VF = CandVF; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll index 5f2c42d5c2dec8..f1989712657203 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll @@ -9,43 +9,31 @@ ;} define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) { -; NON-POW2-LABEL: @foo( -; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> zeroinitializer -; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP2]], [[TMP4]] -; NON-POW2-NEXT: [[TMP6:%.*]] = fpext <3 x float> [[TMP5]] to <3 x double> -; NON-POW2-NEXT: [[TMP7:%.*]] = fadd <3 x double> [[TMP6]], -; NON-POW2-NEXT: [[TMP8:%.*]] = fptosi <3 x double> [[TMP7]] to <3 x i8> -; NON-POW2-NEXT: store <3 x i8> [[TMP8]], ptr [[A:%.*]], align 1 -; NON-POW2-NEXT: ret i32 undef -; -; POW2-ONLY-LABEL: @foo( -; POW2-ONLY-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]] -; POW2-ONLY-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double -; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00 -; POW2-ONLY-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8 -; POW2-ONLY-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1 -; 
POW2-ONLY-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11 -; POW2-ONLY-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 -; POW2-ONLY-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]] -; POW2-ONLY-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double -; POW2-ONLY-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00 -; POW2-ONLY-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8 -; POW2-ONLY-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 -; POW2-ONLY-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 -; POW2-ONLY-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12 -; POW2-ONLY-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 -; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]] -; POW2-ONLY-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double -; POW2-ONLY-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00 -; POW2-ONLY-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8 -; POW2-ONLY-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 -; POW2-ONLY-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1 -; POW2-ONLY-NEXT: ret i32 undef +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double +; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8 +; CHECK-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]] +; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double +; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00 +; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] 
to i8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 +; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]] +; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double +; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00 +; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 +; CHECK-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1 +; CHECK-NEXT: ret i32 undef ; %1 = getelementptr inbounds float, ptr %B, i64 10 %2 = load float, ptr %1, align 4 From f08f9cd9713332c939889ab34f5355b77f12f82b Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 29 Aug 2024 12:56:15 -0700 Subject: [PATCH 48/72] [HWASan] remove incorrectly inferred attributes (#106565) assume all functions used in a HWASan module potentially touch shadow memory (and short granules). 
--- .../Instrumentation/HWAddressSanitizer.cpp | 26 +- .../HWAddressSanitizer/RISCV/alloca.ll | 156 +++++----- .../HWAddressSanitizer/RISCV/basic.ll | 270 +++++++++--------- .../HWAddressSanitizer/alloca.ll | 160 ++++++----- .../HWAddressSanitizer/attrinfer.ll | 14 + .../HWAddressSanitizer/basic.ll | 208 +++++++------- .../HWAddressSanitizer/fixed-shadow.ll | 4 +- .../hwasan-pass-second-run.ll | 4 +- .../HWAddressSanitizer/mem-attr.ll | 2 +- 9 files changed, 437 insertions(+), 407 deletions(-) create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 95433a216b168d..f5faf117f69bdd 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,6 +598,24 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); + for (auto &F : M.functions()) { + // Remove memory attributes that are invalid with HWASan. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. + // + // This is not only true for sanitized functions, because AttrInfer can + // infer those attributes on libc functions, which is not true if those + // are instrumented (Android) or intercepted. + + // nobuiltin makes sure later passes don't restore assumptions about + // the function. 
+ F.addFnAttr(llvm::Attribute::NoBuiltin); + F.removeFnAttr(llvm::Attribute::Memory); + for (auto &A : F.args()) + A.removeAttr(llvm::Attribute::WriteOnly); + } + // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1622,14 +1640,6 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); - // Remove memory attributes that are about to become invalid. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. - F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 23b1043c700165..032168e28421b9 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. 
define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr 
[[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = 
ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: 
[[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] 
= ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: 
store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -131,15 +131,17 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. -; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
-; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. ; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -147,15 +149,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 
5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -163,13 +166,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: 
[[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index 9cebe2e845f772..dc2d11cb4b3538 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,8 +9,6 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW -; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] -; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -32,7 +30,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label 
[[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -68,7 +66,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -88,7 +86,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -108,10 +106,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -120,13 +118,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -145,7 +143,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -165,10 +163,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -177,13 +175,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -212,7 +210,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -248,7 +246,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -268,7 
+266,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -288,10 +286,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -300,13 +298,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -325,7 +323,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -345,10 +343,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -357,13 +355,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -392,7 +390,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr 
i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -428,7 +426,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -448,7 +446,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -468,10 +466,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 
[[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -480,13 +478,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-DYNAMIC-SHADOW: 22: @@ -505,7 +503,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -525,10 +523,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -537,13 +535,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -572,7 +570,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -608,7 +606,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], 
label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -628,7 +626,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -648,10 +646,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -660,13 
+658,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -685,7 +683,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -705,10 +703,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to 
ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -717,13 +715,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 
21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -752,7 +750,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -788,7 +786,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -808,7 +806,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; 
ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -828,10 +826,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -840,13 +838,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label 
[[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -865,7 +863,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -885,10 +883,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br 
label [[TMP21:%.*]] @@ -897,13 +895,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1013,7 +1011,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1049,7 +1047,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: 
[[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1069,7 +1067,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1089,10 +1087,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], 
label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1101,13 +1099,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1126,7 +1124,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1146,10 +1144,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1158,13 +1156,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1193,7 +1191,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1229,7 +1227,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1249,7 +1247,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1269,10 +1267,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1281,13 +1279,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 
16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1306,7 +1304,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1326,10 +1324,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1338,13 +1336,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1373,7 +1371,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: 
br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1409,7 +1407,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1429,7 +1427,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1449,10 +1447,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label 
[[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1461,13 +1459,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1486,7 +1484,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1506,10 +1504,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1518,13 +1516,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1553,7 +1551,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1589,7 +1587,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1609,7 +1607,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1629,10 +1627,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1641,13 +1639,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1666,7 +1664,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1686,10 +1684,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1698,13 +1696,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1733,7 +1731,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1769,7 +1767,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1789,7 +1787,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr 
[[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1809,10 +1807,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1821,13 +1819,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1846,7 +1844,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1866,10 +1864,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm 
sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1878,13 +1876,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -2060,43 +2058,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) { +; FASTPATH-SAME: (ptr [[A:%.*]]) 
#[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -2108,43 +2106,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; NOFASTPATH-NEXT: 
entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 4bd23ea76c159b..0f74736dc232ea 100644 --- 
a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg 
[[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 
[[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr 
@llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; 
ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, 
!dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -112,13 +112,13 @@ entry: define void @test_vscale_alloca() sanitize_hwaddress { ; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR1]] { ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; DYNAMIC-SHADOW-NEXT: ret void ; ; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1]] { ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; ZERO-BASED-SHADOW-NEXT: ret void @@ -150,15 +150,17 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. 
-; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. -; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } +; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -166,15 +168,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: 
[[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -182,13 +185,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = 
!DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll new file mode 100644 index 00000000000000..eeb51aeda1000b --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll @@ -0,0 +1,14 @@ +; Standard library functions get inferred attributes, some of which are not +; correct when building for HWASan. 
+ +; RUN: opt < %s -passes=hwasan -S | FileCheck %s --check-prefixes=CHECK + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android10000" + +declare float @frexpf(float noundef, ptr nocapture noundef) local_unnamed_addr #0 + +attributes #0 = { mustprogress nofree nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "hwasan-abi"="interceptor" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fix-cortex-a53-835769,+fp-armv8,+neon,+outline-atomics,+tagged-globals,+v8a" } + +; CHECK-NOT: memory(argmem: write) +; CHECK: nobuiltin diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 4212293f42545e..1e74f2891a2e3c 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label 
[[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] 
= trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 
[[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, 
ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], 
label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 
; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; 
RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], 
[[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; 
RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1542,43 +1542,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 
[[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -1590,43 +1590,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; 
CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) 
#[[ATTR2:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll index 980189c5607f31..f72fc0a9720e4a 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll @@ -194,7 +194,7 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] @@ -206,7 +206,7 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll index 00614b603fe799..2635dfb75ed98f 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll @@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: @__hwasan_shadow = external global [0 x i8] ;. define i8 @test_load8(ptr %a) sanitize_hwaddress { -; CHECK: Function Attrs: sanitize_hwaddress +; CHECK: Function Attrs: nobuiltin sanitize_hwaddress ; CHECK-LABEL: define i8 @test_load8 ; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -33,7 +33,7 @@ entry: ret i8 %b } ;. 
-; CHECK: attributes #[[ATTR0]] = { sanitize_hwaddress } +; CHECK: attributes #[[ATTR0]] = { nobuiltin sanitize_hwaddress } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0]] = !{ptr @hwasan.note} diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll index c0e370f20213aa..919eacb2951f5e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll @@ -11,5 +11,5 @@ entry: ret void } -; CHECK: attributes #0 = { sanitize_hwaddress uwtable } +; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable } attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable } From 0141a3cde4d8f2c8ff9e957f981f37e65a69a325 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Thu, 29 Aug 2024 21:59:03 +0200 Subject: [PATCH 49/72] [analyzer] Fix nullptr dereference for symbols from pointer invalidation (#106568) As reported in https://github.com/llvm/llvm-project/pull/105648#issuecomment-2317144635 commit 08ad8dc7154bf3ab79f750e6d5fb7df597c7601a introduced a nullptr dereference in the case when store contains a binding to a symbol that has no origin region associated with it, such as the symbol generated when a pointer is passed to an opaque function. 
--- .../Checkers/StackAddrEscapeChecker.cpp | 5 ++++- clang/test/Analysis/stack-addr-ps.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp index 20232405d572d2..ec577c36188e6c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp @@ -308,7 +308,10 @@ static const MemSpaceRegion *getStackOrGlobalSpaceRegion(const MemRegion *R) { const MemRegion *getOriginBaseRegion(const MemRegion *Reg) { Reg = Reg->getBaseRegion(); while (const auto *SymReg = dyn_cast(Reg)) { - Reg = SymReg->getSymbol()->getOriginRegion()->getBaseRegion(); + const auto *OriginReg = SymReg->getSymbol()->getOriginRegion(); + if (!OriginReg) + break; + Reg = OriginReg->getBaseRegion(); } return Reg; } diff --git a/clang/test/Analysis/stack-addr-ps.c b/clang/test/Analysis/stack-addr-ps.c index 138b8c16b02bde..7d7294455f1dbe 100644 --- a/clang/test/Analysis/stack-addr-ps.c +++ b/clang/test/Analysis/stack-addr-ps.c @@ -126,3 +126,21 @@ void caller_for_nested_leaking() { int *ptr = 0; caller_mid_for_nested_leaking(&ptr); } + +// This used to crash StackAddrEscapeChecker because +// it features a symbol conj_$1{struct c *, LC1, S763, #1} +// that has no origin region. 
+struct a { + int member; +}; + +struct c { + struct a *nested_ptr; +}; +void opaque(struct c*); +struct c* get_c(void); +void no_crash_for_symbol_without_origin_region(void) { + struct c *ptr = get_c(); + opaque(ptr); + ptr->nested_ptr->member++; +} // No crash at the end of the function From 66927fb95abef9327b453d7213c5df7d641269be Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 29 Aug 2024 13:06:21 -0700 Subject: [PATCH 50/72] Revert "[HWASan] remove incorrectly inferred attributes" (#106622) Reverts llvm/llvm-project#106565 Broke clang tests --- .../Instrumentation/HWAddressSanitizer.cpp | 26 +- .../HWAddressSanitizer/RISCV/alloca.ll | 156 +++++----- .../HWAddressSanitizer/RISCV/basic.ll | 270 +++++++++--------- .../HWAddressSanitizer/alloca.ll | 160 +++++------ .../HWAddressSanitizer/attrinfer.ll | 14 - .../HWAddressSanitizer/basic.ll | 208 +++++++------- .../HWAddressSanitizer/fixed-shadow.ll | 4 +- .../hwasan-pass-second-run.ll | 4 +- .../HWAddressSanitizer/mem-attr.ll | 2 +- 9 files changed, 407 insertions(+), 437 deletions(-) delete mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index f5faf117f69bdd..95433a216b168d 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,24 +598,6 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); - for (auto &F : M.functions()) { - // Remove memory attributes that are invalid with HWASan. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. 
- // - // This is not only true for sanitized functions, because AttrInfer can - // infer those attributes on libc functions, which is not true if those - // are instrumented (Android) or intercepted. - - // nobuiltin makes sure later passes don't restore assumptions about - // the function. - F.addFnAttr(llvm::Attribute::NoBuiltin); - F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - } - // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1640,6 +1622,14 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); + // Remove memory attributes that are about to become invalid. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. + F.removeFnAttr(llvm::Attribute::Memory); + for (auto &A : F.args()) + A.removeAttr(llvm::Attribute::WriteOnly); + BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 032168e28421b9..23b1043c700165 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. 
define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr 
[[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = 
ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: 
[[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] 
= ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: 
store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; entry: %x = alloca i32, align 4 @@ -131,17 +131,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. -; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
-; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. ; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -149,16 +147,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; DYNAMIC-SHADOW: [[META10]] = !{null} -; 
DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; DYNAMIC-SHADOW: [[META9]] = !{null} +; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -166,14 +163,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META10]] = !{null} -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: 
[[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META9]] = !{null} +; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index dc2d11cb4b3538..9cebe2e845f772 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,6 +9,8 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW +; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -30,7 +32,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label 
[[TMP13:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -66,7 +68,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -86,7 +88,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -106,10 +108,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -118,13 +120,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -143,7 +145,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -163,10 +165,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -175,13 +177,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -210,7 +212,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -246,7 +248,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -266,7 
+268,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -286,10 +288,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -298,13 +300,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -323,7 +325,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -343,10 +345,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -355,13 +357,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -390,7 +392,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr 
i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -426,7 +428,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -446,7 +448,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -466,10 +468,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 
[[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +480,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-DYNAMIC-SHADOW: 22: @@ -503,7 +505,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -523,10 +525,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -535,13 +537,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +572,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -606,7 +608,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], 
label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -626,7 +628,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -646,10 +648,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -658,13 
+660,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -683,7 +685,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -703,10 +705,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to 
ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -715,13 +717,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 
21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -750,7 +752,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -786,7 +788,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -806,7 +808,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; 
ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -826,10 +828,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -838,13 +840,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label 
[[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -863,7 +865,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -883,10 +885,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br 
label [[TMP21:%.*]] @@ -895,13 +897,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1011,7 +1013,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1047,7 +1049,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: 
[[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1067,7 +1069,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1087,10 +1089,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], 
label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1099,13 +1101,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1124,7 +1126,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1144,10 +1146,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1156,13 +1158,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1191,7 +1193,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1227,7 +1229,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1247,7 +1249,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1267,10 +1269,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1279,13 +1281,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 
16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1304,7 +1306,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1324,10 +1326,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1336,13 +1338,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1371,7 +1373,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: 
br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1407,7 +1409,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1427,7 +1429,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1447,10 +1449,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label 
[[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1459,13 +1461,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1484,7 +1486,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; 
ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1504,10 +1506,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1516,13 +1518,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1551,7 +1553,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1587,7 +1589,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1607,7 +1609,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1627,10 +1629,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1639,13 +1641,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1664,7 +1666,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1684,10 +1686,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1696,13 +1698,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1731,7 +1733,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1767,7 +1769,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1787,7 +1789,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr 
[[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1807,10 +1809,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1819,13 +1821,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1844,7 +1846,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1864,10 +1866,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm 
sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1876,13 +1878,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -2058,43 +2060,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: 
(ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -2106,43 +2108,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: 
entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 0f74736dc232ea..4bd23ea76c159b 100644 --- 
a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg 
[[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 
[[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr 
@llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; 
ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, 
!dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; entry: %x = alloca i32, align 4 @@ -112,13 +112,13 @@ entry: define void @test_vscale_alloca() sanitize_hwaddress { ; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] { ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; DYNAMIC-SHADOW-NEXT: ret void ; ; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] { ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; ZERO-BASED-SHADOW-NEXT: ret void @@ -150,17 +150,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. 
-; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. -; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -168,16 +166,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; DYNAMIC-SHADOW: [[META10]] = !{null} -; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: 
[[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; DYNAMIC-SHADOW: [[META9]] = !{null} +; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -185,14 +182,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META10]] = !{null} -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: 
[[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META9]] = !{null} +; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll deleted file mode 100644 index eeb51aeda1000b..00000000000000 --- a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll +++ /dev/null @@ -1,14 +0,0 @@ -; Standard library functions get inferred attributes, some of which are not -; correct when building for HWASan. 
- -; RUN: opt < %s -passes=hwasan -S | FileCheck %s --check-prefixes=CHECK - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android10000" - -declare float @frexpf(float noundef, ptr nocapture noundef) local_unnamed_addr #0 - -attributes #0 = { mustprogress nofree nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "hwasan-abi"="interceptor" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fix-cortex-a53-835769,+fp-armv8,+neon,+outline-atomics,+tagged-globals,+v8a" } - -; CHECK-NOT: memory(argmem: write) -; CHECK: nobuiltin diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 1e74f2891a2e3c..4212293f42545e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label 
[[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; 
RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] 
= trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 
[[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, 
ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], 
label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 
; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; 
RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp 
ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void 
@llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], 
[[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; 
RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1542,43 +1542,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 
[[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -1590,43 +1590,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; 
CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) 
#[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll index f72fc0a9720e4a..980189c5607f31 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll @@ -194,7 +194,7 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] @@ -206,7 +206,7 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll index 2635dfb75ed98f..00614b603fe799 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll @@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: @__hwasan_shadow = external global [0 x i8] ;. define i8 @test_load8(ptr %a) sanitize_hwaddress { -; CHECK: Function Attrs: nobuiltin sanitize_hwaddress +; CHECK: Function Attrs: sanitize_hwaddress ; CHECK-LABEL: define i8 @test_load8 ; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -33,7 +33,7 @@ entry: ret i8 %b } ;. 
-; CHECK: attributes #[[ATTR0]] = { nobuiltin sanitize_hwaddress } +; CHECK: attributes #[[ATTR0]] = { sanitize_hwaddress } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0]] = !{ptr @hwasan.note} diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll index 919eacb2951f5e..c0e370f20213aa 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll @@ -11,5 +11,5 @@ entry: ret void } -; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable } +; CHECK: attributes #0 = { sanitize_hwaddress uwtable } attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable } From c4906588ce47de33d59bcd95f3e82ce2c3e61c23 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 29 Aug 2024 21:19:59 +0100 Subject: [PATCH 51/72] [VPlan] Use skipCostComputation when pre-computing induction costs. This ensures we skip any instructions identified to be ignored by the legacy cost model as well. Fixes a divergence between legacy and VPlan-based cost model. Fixes https://github.com/llvm/llvm-project/issues/106417. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- .../LoopVectorize/RISCV/induction-costs.ll | 192 ++++++++++++++++++ 2 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4cc75e2e754603..6babfd1eee9108 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7113,7 +7113,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, IVInsts.push_back(CI); } for (Instruction *IVInst : IVInsts) { - if (!CostCtx.SkipCostComputation.insert(IVInst).second) + if (CostCtx.skipCostComputation(IVInst, VF.isVector())) continue; InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); LLVM_DEBUG({ @@ -7121,6 +7121,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, << ": induction instruction " << *IVInst << "\n"; }); Cost += InductionCost; + CostCtx.SkipCostComputation.insert(IVInst); } } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll new file mode 100644 index 00000000000000..bee7bb7bd61622 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -0,0 +1,192 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Test case for https://github.com/llvm/llvm-project/issues/106417. 
+define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { +; CHECK-LABEL: define void @skip_free_iv_truncate( +; CHECK-SAME: i16 [[X:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[X_I32:%.*]] = sext i16 [[X]] to i32 +; CHECK-NEXT: [[X_I64:%.*]] = sext i16 [[X]] to i64 +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 -8 +; CHECK-NEXT: [[SMAX20:%.*]] = call i64 @llvm.smax.i64(i64 [[X_I64]], i64 99) +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SMAX20]], [[X_I64]] +; CHECK-NEXT: [[UMIN21:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[SMAX20]], [[UMIN21]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[X_I64]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UMIN21]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umax.i64(i64 288, i64 [[TMP7]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP8]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[X_I64]], i64 99) +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[SMAX]], [[X_I64]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP9]], i64 1) +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[SMAX]], [[UMIN]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[X_I64]] +; CHECK-NEXT: [[TMP12:%.*]] = udiv i64 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[UMIN]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nsw i64 [[X_I64]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 6, i64 [[TMP13]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } 
[[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult ptr [[TMP16]], [[SCEVGEP]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[X_I64]], 3 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]] +; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 24, i64 [[TMP13]]) +; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = sub i64 0, [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp ult ptr [[TMP21]], [[SCEVGEP1]] +; CHECK-NEXT: [[TMP23:%.*]] = or i1 [[TMP22]], [[MUL_OVERFLOW4]] +; CHECK-NEXT: [[TMP24:%.*]] = add nsw i64 [[TMP19]], -8 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP24]] +; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 24, i64 [[TMP13]]) +; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = sub i64 0, [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult ptr [[TMP26]], [[SCEVGEP5]] +; CHECK-NEXT: [[TMP28:%.*]] = or i1 [[TMP27]], [[MUL_OVERFLOW8]] +; CHECK-NEXT: [[TMP29:%.*]] = or i1 [[TMP18]], [[TMP23]] +; CHECK-NEXT: [[TMP30:%.*]] = or i1 [[TMP29]], [[TMP28]] +; CHECK-NEXT: br i1 [[TMP30]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP31:%.*]] = shl nsw i64 [[X_I64]], 1 +; CHECK-NEXT: 
[[SCEVGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[SMAX10:%.*]] = call i64 @llvm.smax.i64(i64 [[X_I64]], i64 99) +; CHECK-NEXT: [[TMP32:%.*]] = sub i64 [[SMAX10]], [[X_I64]] +; CHECK-NEXT: [[UMIN11:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP32]], i64 1) +; CHECK-NEXT: [[TMP33:%.*]] = sub i64 [[SMAX10]], [[UMIN11]] +; CHECK-NEXT: [[TMP34:%.*]] = sub i64 [[TMP33]], [[X_I64]] +; CHECK-NEXT: [[TMP35:%.*]] = udiv i64 [[TMP34]], 3 +; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[UMIN11]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 6 +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP37]], [[TMP31]] +; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 2 +; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = shl nsw i64 [[X_I64]], 3 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP36]], 24 +; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], 8 +; CHECK-NEXT: [[SCEVGEP14:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = add nsw i64 [[TMP40]], -8 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP44]] +; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP42]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP9]], [[SCEVGEP14]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP13]], [[SCEVGEP12]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND017:%.*]] = icmp ult ptr [[SCEVGEP9]], [[SCEVGEP16]] +; CHECK-NEXT: [[BOUND118:%.*]] = icmp ult ptr [[SCEVGEP15]], [[SCEVGEP12]] +; CHECK-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; 
CHECK-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP46]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3 +; CHECK-NEXT: [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[X_I64]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP53:%.*]] = call @llvm.stepvector.nxv8i64() +; CHECK-NEXT: [[TMP54:%.*]] = add [[TMP53]], zeroinitializer +; CHECK-NEXT: [[TMP55:%.*]] = mul [[TMP54]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP55]] +; CHECK-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 8 +; CHECK-NEXT: [[TMP58:%.*]] = mul i64 3, [[TMP57]] +; CHECK-NEXT: [[DOTSPLATINSERT24:%.*]] = insertelement poison, i64 [[TMP58]], i64 0 +; CHECK-NEXT: [[DOTSPLAT25:%.*]] = shufflevector [[DOTSPLATINSERT24]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i16, ptr [[A]], [[VEC_IND]] +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv8i16.nxv8p0( zeroinitializer, [[TMP59]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP52]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT25]] +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[ENTRY]] ], [ [[X_I64]], %[[VECTOR_SCEVCHECK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL23:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[ENTRY]] ], [ [[X_I32]], %[[VECTOR_SCEVCHECK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL23]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_I64:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP61:%.*]] = load i64, ptr [[GEP_I64]], align 8 +; CHECK-NEXT: [[TMP62:%.*]] = sext i32 [[IV_CONV]] to i64 +; CHECK-NEXT: [[GEP_CONV:%.*]] = getelementptr i64, ptr [[INVARIANT_GEP]], i64 [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = load i64, ptr [[GEP_CONV]], align 8 +; CHECK-NEXT: [[GEP_I16:%.*]] = getelementptr i16, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i16 0, ptr [[GEP_I16]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 +; CHECK-NEXT: [[TMP64]] = trunc i64 [[IV_NEXT]] to i32 +; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], 99 +; CHECK-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]] +; 
CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %x.i32 = sext i16 %x to i32 + %x.i64 = sext i16 %x to i64 + %invariant.gep = getelementptr i8, ptr %A, i64 -8 + br label %loop + +loop: + %iv = phi i64 [ %x.i64, %entry ], [ %iv.next, %loop ] + %iv.conv = phi i32 [ %x.i32, %entry ], [ %5, %loop ] + %gep.i64 = getelementptr i64, ptr %A, i64 %iv + %2 = load i64, ptr %gep.i64, align 8 + %3 = sext i32 %iv.conv to i64 + %gep.conv = getelementptr i64, ptr %invariant.gep, i64 %3 + %4 = load i64, ptr %gep.conv, align 8 + %gep.i16 = getelementptr i16, ptr %A, i64 %iv + store i16 0, ptr %gep.i16, align 2 + %iv.next = add i64 %iv, 3 + %5 = trunc i64 %iv.next to i32 + %c = icmp slt i64 %iv, 99 + br i1 %c, label %loop, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META5:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[META5]] = distinct !{[[META5]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]]} +;. 
From 1f0d545ec38ceaafa7ca94aa659be125bdcd721f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 29 Aug 2024 16:37:25 -0400 Subject: [PATCH 52/72] [libc++] Fix wraparound issue with -fsanitize=integer in string operator>> (#106263) Fixes #106261 rdar://133991190 --- libcxx/include/__type_traits/make_unsigned.h | 4 +--- libcxx/include/istream | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h index 282cd2d9113166..8757f451eb807b 100644 --- a/libcxx/include/__type_traits/make_unsigned.h +++ b/libcxx/include/__type_traits/make_unsigned.h @@ -86,12 +86,10 @@ template using make_unsigned_t = __make_unsigned_t<_Tp>; #endif -#ifndef _LIBCPP_CXX03_LANG template -_LIBCPP_HIDE_FROM_ABI constexpr __make_unsigned_t<_Tp> __to_unsigned_like(_Tp __x) noexcept { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __make_unsigned_t<_Tp> __to_unsigned_like(_Tp __x) _NOEXCEPT { return static_cast<__make_unsigned_t<_Tp> >(__x); } -#endif template using __copy_unsigned_t = __conditional_t::value, __make_unsigned_t<_Up>, _Up>; diff --git a/libcxx/include/istream b/libcxx/include/istream index d2b577a9ad9efc..7c65a24bc313d9 100644 --- a/libcxx/include/istream +++ b/libcxx/include/istream @@ -165,6 +165,7 @@ template #include <__type_traits/conjunction.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_base_of.h> +#include <__type_traits/make_unsigned.h> #include <__utility/declval.h> #include <__utility/forward.h> #include @@ -1211,12 +1212,17 @@ operator>>(basic_istream<_CharT, _Traits>& __is, basic_string<_CharT, _Traits, _ try { #endif __str.clear(); - streamsize __n = __is.width(); - if (__n <= 0) - __n = __str.max_size(); - if (__n <= 0) - __n = numeric_limits::max(); - streamsize __c = 0; + using _Size = typename basic_string<_CharT, _Traits, _Allocator>::size_type; + streamsize const __width = __is.width(); + _Size const __max_size = 
__str.max_size(); + _Size __n; + if (__width <= 0) { + __n = __max_size; + } else { + __n = std::__to_unsigned_like(__width) < __max_size ? static_cast<_Size>(__width) : __max_size; + } + + _Size __c = 0; const ctype<_CharT>& __ct = std::use_facet >(__is.getloc()); while (__c < __n) { typename _Traits::int_type __i = __is.rdbuf()->sgetc(); From 049b60c5bb7e774b74772c6b89c72593f73a89b0 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Thu, 29 Aug 2024 17:00:19 -0400 Subject: [PATCH 53/72] [NFC][Clang] Avoid potential null pointer dereferences in Sema::AddInitializerToDecl(). (#106235) Control flow analysis performed by a static analysis tool revealed the potential for null pointer dereferences to occur in conjunction with the `Init` parameter in `Sema::AddInitializerToDecl()`. On entry to the function, `Init` is required to be non-null as there are multiple potential branches that unconditionally dereference it. However, there were two places where `Init` is compared to null thus implying that `Init` is expected to be null in some cases. These checks appear to be purely defensive checks and thus unnecessary. Further, there were several cases where code checked `Result`, a variable of type `ExprResult`, for an invalid value, but did not check for a valid but null value and then proceeded to unconditionally dereference the potential null result. This change elides the unnecessary defensive checks and changes some checks for an invalid result to instead branch on an unusable result (either an invalid result or a valid but null result). 
--- clang/lib/Sema/SemaDecl.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 4efa80778e71b9..6327ae9b99aa4c 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -13319,8 +13319,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { } // WebAssembly tables can't be used to initialise a variable. - if (Init && !Init->getType().isNull() && - Init->getType()->isWebAssemblyTableType()) { + if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) { Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0; VDecl->setInvalidDecl(); return; @@ -13463,7 +13462,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { if (getLangOpts().DebuggerCastResultToId && DclT->isObjCObjectPointerType() && Init->getType() == Context.UnknownAnyTy) { ExprResult Result = forceUnknownAnyToType(Init, Context.getObjCIdType()); - if (Result.isInvalid()) { + if (!Result.isUsable()) { VDecl->setInvalidDecl(); return; } @@ -13491,7 +13490,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { InitializationSequence Init(*this, Entity, Kind, MultiExprArg(E)); return Init.Failed() ? ExprError() : E; }); - if (Res.isInvalid()) { + if (!Res.isUsable()) { VDecl->setInvalidDecl(); } else if (Res.get() != Args[Idx]) { Args[Idx] = Res.get(); @@ -13504,7 +13503,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { /*TopLevelOfInitList=*/false, /*TreatUnavailableAsInvalid=*/false); ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Args, &DclT); - if (Result.isInvalid()) { + if (!Result.isUsable()) { // If the provided initializer fails to initialize the var decl, // we attach a recovery expr for better recovery. 
auto RecoveryExpr = @@ -13528,8 +13527,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { InitSeq.step_begin()->Kind == InitializationSequence::SK_ParenthesizedListInit; QualType VDeclType = VDecl->getType(); - if (Init && !Init->getType().isNull() && - !Init->getType()->isDependentType() && !VDeclType->isDependentType() && + if (!Init->getType().isNull() && !Init->getType()->isDependentType() && + !VDeclType->isDependentType() && Context.getAsIncompleteArrayType(VDeclType) && Context.getAsIncompleteArrayType(Init->getType())) { // Bail out if it is not possible to deduce array size from the @@ -13592,7 +13591,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { ExprResult Result = ActOnFinishFullExpr(Init, VDecl->getLocation(), /*DiscardedValue*/ false, VDecl->isConstexpr()); - if (Result.isInvalid()) { + if (!Result.isUsable()) { VDecl->setInvalidDecl(); return; } From 593526f3fb138069fc93b14d08320d0e3f67c707 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 14:02:26 -0700 Subject: [PATCH 54/72] [X86] Use MCRegister instead of int64_t in X86MCExpr. NFC (#106569) --- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 6 +++--- llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 864b7d8e769ab1..2b6b0ad16bcf76 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2789,7 +2789,7 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { if (auto *RE = dyn_cast(Expr)) { // Segment Register. Reset Expr and copy value to register. Expr = nullptr; - Reg = RE->getRegNo(); + Reg = RE->getReg(); // Check the register. 
if (Reg == X86::EIZ || Reg == X86::RIZ) @@ -3052,7 +3052,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, return true; // Check the register. - BaseReg = cast(E)->getRegNo(); + BaseReg = cast(E)->getReg(); if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) return Error(BaseLoc, "eiz and riz can only be used as index registers", SMRange(BaseLoc, EndLoc)); @@ -3079,7 +3079,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, Warning(Loc, "scale factor without index register is ignored"); Scale = 1; } else { // IndexReg Found. - IndexReg = cast(E)->getRegNo(); + IndexReg = cast(E)->getReg(); if (BaseReg == X86::RIP) return Error(Loc, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h index c159d30194cc64..37e15193b04357 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h @@ -26,16 +26,16 @@ namespace llvm { class X86MCExpr : public MCTargetExpr { private: - const int64_t RegNo; // All + const MCRegister Reg; // All - explicit X86MCExpr(int64_t R) : RegNo(R) {} + explicit X86MCExpr(MCRegister R) : Reg(R) {} public: /// @name Construction /// @{ - static const X86MCExpr *create(int64_t RegNo, MCContext &Ctx) { - return new (Ctx) X86MCExpr(RegNo); + static const X86MCExpr *create(MCRegister Reg, MCContext &Ctx) { + return new (Ctx) X86MCExpr(Reg); } /// @} @@ -43,14 +43,14 @@ class X86MCExpr : public MCTargetExpr { /// @{ /// getSubExpr - Get the child of this expression. 
- int64_t getRegNo() const { return RegNo; } + MCRegister getReg() const { return Reg; } /// @} void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { if (!MAI || MAI->getAssemblerDialect() == 0) OS << '%'; - OS << X86ATTInstPrinter::getRegisterName(RegNo); + OS << X86ATTInstPrinter::getRegisterName(Reg); } bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, @@ -61,7 +61,7 @@ class X86MCExpr : public MCTargetExpr { bool inlineAssignedExpr() const override { return true; } bool isEqualTo(const MCExpr *X) const override { if (auto *E = dyn_cast(X)) - return getRegNo() == E->getRegNo(); + return getReg() == E->getReg(); return false; } void visitUsedExpr(MCStreamer &Streamer) const override {} From 4ca817d0511b2c36b2f5d242e0c8f90a7a9c4f14 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2024 14:02:53 -0700 Subject: [PATCH 55/72] [GlobalISel] Add bail outs for scalable vectors to some combines. (#106496) These combines call getNumElements() which isn't valid for scalable vectors. --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 9 +++++ .../GlobalISel/scalablevec-combiner-crash.ll | 35 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 1517ae707c8cff..df9c12bc9c97bd 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2678,6 +2678,9 @@ bool CombinerHelper::matchInsertExtractVecEltOutOfBounds(MachineInstr &MI) { MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) && "Expected an insert/extract element op"); LLT VecTy = MRI.getType(MI.getOperand(1).getReg()); + if (VecTy.isScalableVector()) + return false; + unsigned IdxIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 
2 : 3; auto Idx = getIConstantVRegVal(MI.getOperand(IdxIdx).getReg(), MRI); @@ -2961,6 +2964,10 @@ bool CombinerHelper::matchCombineInsertVecElts( Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); assert(DstTy.isVector() && "Invalid G_INSERT_VECTOR_ELT?"); + + if (DstTy.isScalableVector()) + return false; + unsigned NumElts = DstTy.getNumElements(); // If this MI is part of a sequence of insert_vec_elts, then // don't do the combine in the middle of the sequence. @@ -4046,6 +4053,8 @@ bool CombinerHelper::matchExtractVecEltBuildVec(MachineInstr &MI, // and find the source register that the index maps to. Register SrcVec = MI.getOperand(1).getReg(); LLT SrcTy = MRI.getType(SrcVec); + if (SrcTy.isScalableVector()) + return false; auto Cst = getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); if (!Cst || Cst->Value.getZExtValue() >= SrcTy.getNumElements()) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll b/llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll new file mode 100644 index 00000000000000..8ce4b334a5134c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/scalablevec-combiner-crash.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -global-isel -stop-after=riscv-prelegalizer-combiner | FileCheck %s + +; Make sure we don't crash in the prelegalizer combiner for scalable vector +; insert and extracts. 
+ +define @insertelement_nxv1i1_0( %x) { + ; CHECK-LABEL: name: insertelement_nxv1i1_0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_() = G_INSERT_VECTOR_ELT [[COPY]], [[C]](s1), [[C1]](s64) + ; CHECK-NEXT: $v0 = COPY [[IVEC]]() + ; CHECK-NEXT: PseudoRET implicit $v0 + %a = insertelement %x, i1 0, i32 0 + ret %a +} + +define @shufflevector_nxv1i1_0( %x) { + ; CHECK-LABEL: name: shufflevector_nxv1i1_0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s1) = G_EXTRACT_VECTOR_ELT [[COPY]](), [[C]](s64) + ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[EVEC]](s1) + ; CHECK-NEXT: $v0 = COPY [[SPLAT_VECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v0 + %a = shufflevector %x, poison, poison + ret %a +} From 182708680bbe34b579a09b2dbc3215b519b2473f Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Thu, 29 Aug 2024 14:25:34 -0700 Subject: [PATCH 56/72] [SandboxIR] Add ExtractValueInst. 
(#106613) --- llvm/include/llvm/SandboxIR/SandboxIR.h | 66 ++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 27 +++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 100 ++++++++++++++++++ 4 files changed, 194 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 14210ae35c0082..d4c2efca4ecfa0 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -74,6 +74,8 @@ // | // +- ShuffleVectorInst // | +// +- ExtractValueInst +// | // +- InsertValueInst // | // +- StoreInst @@ -120,6 +122,7 @@ class SelectInst; class ExtractElementInst; class InsertElementInst; class ShuffleVectorInst; +class ExtractValueInst; class InsertValueInst; class BranchInst; class UnaryInstruction; @@ -270,6 +273,7 @@ class Value { friend class ExtractElementInst; // For getting `Val`. friend class InsertElementInst; // For getting `Val`. friend class ShuffleVectorInst; // For getting `Val`. + friend class ExtractValueInst; // For getting `Val`. friend class InsertValueInst; // For getting `Val`. friend class BranchInst; // For getting `Val`. friend class LoadInst; // For getting `Val`. @@ -710,6 +714,7 @@ class Instruction : public sandboxir::User { friend class ExtractElementInst; // For getTopmostLLVMInstruction(). friend class InsertElementInst; // For getTopmostLLVMInstruction(). friend class ShuffleVectorInst; // For getTopmostLLVMInstruction(). + friend class ExtractValueInst; // For getTopmostLLVMInstruction(). friend class InsertValueInst; // For getTopmostLLVMInstruction(). friend class BranchInst; // For getTopmostLLVMInstruction(). friend class LoadInst; // For getTopmostLLVMInstruction(). @@ -1621,6 +1626,65 @@ class UnaryInstruction } }; +class ExtractValueInst : public UnaryInstruction { + /// Use Context::createExtractValueInst() instead. 
+ ExtractValueInst(llvm::ExtractValueInst *EVI, Context &Ctx) + : UnaryInstruction(ClassID::ExtractValue, Opcode::ExtractValue, EVI, + Ctx) {} + friend Context; // for ExtractValueInst() + +public: + static Value *create(Value *Agg, ArrayRef Idxs, BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx, + const Twine &Name = ""); + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ExtractValue; + } + + /// Returns the type of the element that would be extracted + /// with an extractvalue instruction with the specified parameters. + /// + /// Null is returned if the indices are invalid for the specified type. + static Type *getIndexedType(Type *Agg, ArrayRef Idxs) { + return llvm::ExtractValueInst::getIndexedType(Agg, Idxs); + } + + using idx_iterator = llvm::ExtractValueInst::idx_iterator; + + inline idx_iterator idx_begin() const { + return cast(Val)->idx_begin(); + } + inline idx_iterator idx_end() const { + return cast(Val)->idx_end(); + } + inline iterator_range indices() const { + return cast(Val)->indices(); + } + + Value *getAggregateOperand() { + return getOperand(getAggregateOperandIndex()); + } + const Value *getAggregateOperand() const { + return getOperand(getAggregateOperandIndex()); + } + static unsigned getAggregateOperandIndex() { + return llvm::ExtractValueInst::getAggregateOperandIndex(); + } + + ArrayRef getIndices() const { + return cast(Val)->getIndices(); + } + + unsigned getNumIndices() const { + return cast(Val)->getNumIndices(); + } + + unsigned hasIndices() const { + return cast(Val)->hasIndices(); + } +}; + class VAArgInst : public UnaryInstruction { VAArgInst(llvm::VAArgInst *FI, Context &Ctx) : UnaryInstruction(ClassID::VAArg, Opcode::VAArg, FI, Ctx) {} @@ -3123,6 +3187,8 @@ class Context { friend ExtractElementInst; // For createExtractElementInst() ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); friend ShuffleVectorInst; // For createShuffleVectorInst() + ExtractValueInst 
*createExtractValueInst(llvm::ExtractValueInst *IVI); + friend ExtractValueInst; // For createExtractValueInst() InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI); friend InsertValueInst; // For createInsertValueInst() BranchInst *createBranchInst(llvm::BranchInst *I); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 49b03ad6760d37..d29fc3b5e95871 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -47,6 +47,7 @@ DEF_INSTR(VAArg, OP(VAArg), VAArgInst) DEF_INSTR(Freeze, OP(Freeze), FreezeInst) DEF_INSTR(Fence, OP(Fence), FenceInst) DEF_INSTR(ShuffleVector, OP(ShuffleVector), ShuffleVectorInst) +DEF_INSTR(ExtractValue, OP(ExtractValue), ExtractValueInst) DEF_INSTR(InsertValue, OP(InsertValue), InsertValueInst) DEF_INSTR(Select, OP(Select), SelectInst) DEF_INSTR(Br, OP(Br), BranchInst) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 86198b7b83d0e2..b5786cdafd6307 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2145,6 +2145,21 @@ Constant *ShuffleVectorInst::convertShuffleMaskForBitcode( llvm::ShuffleVectorInst::convertShuffleMaskForBitcode(Mask, ResultTy)); } +Value *ExtractValueInst::create(Value *Agg, ArrayRef Idxs, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + llvm::Value *NewV = Builder.CreateExtractValue(Agg->Val, Idxs, Name); + if (auto *NewExtractValueInst = dyn_cast(NewV)) + return Ctx.createExtractValueInst(NewExtractValueInst); + assert(isa(NewV) && "Expected constant"); + return Ctx.getOrCreateConstant(cast(NewV)); +} + Value *InsertValueInst::create(Value *Agg, Value *Val, ArrayRef Idxs, BBIterator 
WhereIt, BasicBlock *WhereBB, Context &Ctx, const Twine &Name) { @@ -2320,6 +2335,12 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { new ShuffleVectorInst(LLVMIns, *this)); return It->second.get(); } + case llvm::Instruction::ExtractValue: { + auto *LLVMIns = cast(LLVMV); + It->second = + std::unique_ptr(new ExtractValueInst(LLVMIns, *this)); + return It->second.get(); + } case llvm::Instruction::InsertValue: { auto *LLVMIns = cast(LLVMV); It->second = @@ -2548,6 +2569,12 @@ Context::createShuffleVectorInst(llvm::ShuffleVectorInst *SVI) { return cast(registerValue(std::move(NewPtr))); } +ExtractValueInst *Context::createExtractValueInst(llvm::ExtractValueInst *EVI) { + auto NewPtr = + std::unique_ptr(new ExtractValueInst(EVI, *this)); + return cast(registerValue(std::move(NewPtr))); +} + InsertValueInst *Context::createInsertValueInst(llvm::InsertValueInst *IVI) { auto NewPtr = std::unique_ptr(new InsertValueInst(IVI, *this)); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 31519074e1b908..8bf4b24c48ee0d 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1261,6 +1261,106 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { } } +TEST_F(SandboxIRTest, ExtractValueInst) { + parseIR(C, R"IR( +define void @foo({i32, float} %agg) { + %ext_simple = extractvalue {i32, float} %agg, 0 + %ext_nested = extractvalue {float, {i32}} undef, 1, 0 + %const1 = extractvalue {i32, float} {i32 0, float 99.0}, 0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *ArgAgg = F.getArg(0); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *ExtSimple = cast(&*It++); + auto *ExtNested = cast(&*It++); + auto *Const1 = cast(&*It++); + auto *Ret = &*It++; + + EXPECT_EQ(ExtSimple->getOperand(0), ArgAgg); + + // create before instruction + auto 
*NewExtBeforeRet = + cast(sandboxir::ExtractValueInst::create( + ArgAgg, ArrayRef({0}), Ret->getIterator(), Ret->getParent(), + Ctx, "NewExtBeforeRet")); + EXPECT_EQ(NewExtBeforeRet->getNextNode(), Ret); +#ifndef NDEBUG + EXPECT_EQ(NewExtBeforeRet->getName(), "NewExtBeforeRet"); +#endif // NDEBUG + + // create at end of BB + auto *NewExtAtEnd = + cast(sandboxir::ExtractValueInst::create( + ArgAgg, ArrayRef({0}), BB->end(), BB, Ctx, "NewExtAtEnd")); + EXPECT_EQ(NewExtAtEnd->getPrevNode(), Ret); +#ifndef NDEBUG + EXPECT_EQ(NewExtAtEnd->getName(), "NewExtAtEnd"); +#endif // NDEBUG + + // Test the path that creates a folded constant. + auto *ShouldBeConstant = sandboxir::ExtractValueInst::create( + Const1->getOperand(0), ArrayRef({0}), BB->end(), BB, Ctx); + EXPECT_TRUE(isa(ShouldBeConstant)); + + auto *Zero = sandboxir::ConstantInt::get(Type::getInt32Ty(C), 0, Ctx); + EXPECT_EQ(ShouldBeConstant, Zero); + + // getIndexedType + Type *AggType = ExtNested->getAggregateOperand()->getType(); + EXPECT_EQ(sandboxir::ExtractValueInst::getIndexedType( + AggType, ArrayRef({1, 0})), + llvm::ExtractValueInst::getIndexedType(AggType, + ArrayRef({1, 0}))); + + EXPECT_EQ(sandboxir::ExtractValueInst::getIndexedType( + AggType, ArrayRef({2})), + nullptr); + + // idx_begin / idx_end + { + SmallVector IndicesSimple(ExtSimple->idx_begin(), + ExtSimple->idx_end()); + EXPECT_THAT(IndicesSimple, testing::ElementsAre(0u)); + + SmallVector IndicesNested(ExtNested->idx_begin(), + ExtNested->idx_end()); + EXPECT_THAT(IndicesNested, testing::ElementsAre(1u, 0u)); + } + + // indices + { + SmallVector IndicesSimple(ExtSimple->indices()); + EXPECT_THAT(IndicesSimple, testing::ElementsAre(0u)); + + SmallVector IndicesNested(ExtNested->indices()); + EXPECT_THAT(IndicesNested, testing::ElementsAre(1u, 0u)); + } + + // getAggregateOperand + EXPECT_EQ(ExtSimple->getAggregateOperand(), ArgAgg); + const auto *ConstExtSimple = ExtSimple; + EXPECT_EQ(ConstExtSimple->getAggregateOperand(), ArgAgg); + + // 
getAggregateOperandIndex + EXPECT_EQ(sandboxir::ExtractValueInst::getAggregateOperandIndex(), + llvm::ExtractValueInst::getAggregateOperandIndex()); + + // getIndices + EXPECT_EQ(ExtSimple->getIndices().size(), 1u); + EXPECT_EQ(ExtSimple->getIndices()[0], 0u); + + // getNumIndices + EXPECT_EQ(ExtSimple->getNumIndices(), 1u); + + // hasIndices + EXPECT_EQ(ExtSimple->hasIndices(), true); +} + TEST_F(SandboxIRTest, InsertValueInst) { parseIR(C, R"IR( define void @foo({i32, float} %agg, i32 %i) { From 412e3e394dbd1b7d8655639e161ed4dbd5505c96 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 29 Aug 2024 22:09:19 +0100 Subject: [PATCH 57/72] [ExtendLifetimes][NFC] Add explicit triple to remaining fake-use tests One of the tests for the new fake use intrinsic are failing on darwin buildbots due to relying on behaviour for their expected triple; this commit adds explicit triples to the few remaining fake-use tests that didn't have them. Fixes commit 3d08ade (#86149). Buildbot failures: https://lab.llvm.org/buildbot/#/builders/23/builds/2505 --- llvm/test/CodeGen/X86/fake-use-hpfloat.ll | 2 +- llvm/test/CodeGen/X86/fake-use-vector.ll | 2 +- llvm/test/DebugInfo/X86/fake-use.ll | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/X86/fake-use-hpfloat.ll b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll index 7a95c38801837c..fd511a6179acfe 100644 --- a/llvm/test/CodeGen/X86/fake-use-hpfloat.ll +++ b/llvm/test/CodeGen/X86/fake-use-hpfloat.ll @@ -1,6 +1,6 @@ ; assert in DAGlegalizer with fake use of half precision float. ; Changes to half float promotion. 
-; RUN: llc -stop-after=finalize-isel -o - %s | FileCheck %s +; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s ; ; CHECK: bb.0.entry: ; CHECK-NEXT: %0:fr16 = FsFLD0SH diff --git a/llvm/test/CodeGen/X86/fake-use-vector.ll b/llvm/test/CodeGen/X86/fake-use-vector.ll index cb46ccc8cac11c..4d6ede30827046 100644 --- a/llvm/test/CodeGen/X86/fake-use-vector.ll +++ b/llvm/test/CodeGen/X86/fake-use-vector.ll @@ -1,5 +1,5 @@ ; assert in DAGlegalizer with fake use of 1-element vectors. -; RUN: llc -stop-after=finalize-isel -filetype=asm -o - %s | FileCheck %s +; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -filetype=asm -o - %s | FileCheck %s ; ; ModuleID = 't2.cpp' ; source_filename = "t2.cpp" diff --git a/llvm/test/DebugInfo/X86/fake-use.ll b/llvm/test/DebugInfo/X86/fake-use.ll index f44aadfeef5640..5ac5104a167118 100644 --- a/llvm/test/DebugInfo/X86/fake-use.ll +++ b/llvm/test/DebugInfo/X86/fake-use.ll @@ -3,9 +3,9 @@ ; Make sure the fake use of 'b' at the end of 'foo' causes location information for 'b' ; to extend all the way to the end of the function. -; RUN: %llc_dwarf -O2 -filetype=obj -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: %llc_dwarf -O2 -filetype=obj -mtriple=x86_64-unknown-linux -dwarf-linkage-names=Abstract < %s | llvm-dwarfdump --debug-info --debug-line -v - -o %t ; RUN: %python %p/../Inputs/check-fake-use.py %t -; RUN: sed -e 's,call void (...) @llvm.fake.use,;,' %s | %llc_dwarf - -O2 -filetype=obj -dwarf-linkage-names=Abstract | llvm-dwarfdump --debug-info --debug-line -v - -o %t +; RUN: sed -e 's,call void (...) 
@llvm.fake.use,;,' %s | %llc_dwarf - -O2 -filetype=obj -mtriple=x86_64-unknown-linux -dwarf-linkage-names=Abstract | llvm-dwarfdump --debug-info --debug-line -v - -o %t ; RUN: not %python %p/../Inputs/check-fake-use.py %t ; Generated with: From 7284e0f3a4f8924a0f69f654db8c4b4d00d232cb Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 29 Aug 2024 18:53:03 -0300 Subject: [PATCH 58/72] [clang] mangle placeholder for deduced type as a template-prefix (#106335) As agreed on https://github.com/itanium-cxx-abi/cxx-abi/issues/109 these placeholders should be mangled as a `template-prefix` production. ``` ::=