diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 1a1700b75cfdb7..f2bb37316d3a8b 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -13,6 +13,9 @@ jobs: code_formatter: runs-on: ubuntu-latest timeout-minutes: 30 + concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: true if: github.repository == 'llvm/llvm-project' steps: - name: Fetch LLVM sources diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml index e2f3eeadd15bea..f08088c7bc56f1 100644 --- a/.github/workflows/release-binaries-save-stage/action.yml +++ b/.github/workflows/release-binaries-save-stage/action.yml @@ -10,6 +10,9 @@ inputs: required: true type: 'string' +permissions: + contents: read + runs: using: "composite" steps: @@ -18,6 +21,9 @@ runs: - name: Package Build and Source Directories shell: bash run: | + # Remove .git/config to avoid leaking GITHUB_TOKEN stored there. + # See https://unit42.paloaltonetworks.com/github-repo-artifacts-leak-tokens/ + rm -Rf .git/config # Windows does not support symlinks, so we need to dereference them. tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst mv ../llvm-project.tar.zst . diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 4ef37a5fad67f5..a4d1ceed8c1c4b 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -382,7 +382,7 @@ foreach(target riscv32-unknown-elf) foreach(lang C;CXX;ASM) # TODO: The preprocessor defines workaround various issues in libc and libc++ integration. # These should be addressed and removed over time. 
- set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "") + set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst index 4782eb3cda754a..e143c5b71575aa 100644 --- a/clang/docs/HLSL/ExpectedDifferences.rst +++ b/clang/docs/HLSL/ExpectedDifferences.rst @@ -54,6 +54,19 @@ HLSL 202x based on proposal and `0008 `_. +The largest difference between Clang and DXC's overload resolution is the +algorithm used for identifying best-match overloads. There are more details +about the algorithmic differences in the :ref:`multi_argument_overloads` section +below. There are three high level differences that should be highlighted: + +* **There should be no cases** where DXC and Clang both successfully + resolve an overload where the resolved overload is different between the two. +* There are cases where Clang will successfully resolve an overload that DXC + wouldn't because we've trimmed the overload set in Clang to remove ambiguity. +* There are cases where DXC will successfully resolve an overload that Clang + will not for two reasons: (1) DXC only generates partial overload sets for + builtin functions and (2) DXC resolves cases that probably should be ambiguous. 
+ Clang's implementation extends standard overload resolution rules to HLSL library functionality. This causes subtle changes in overload resolution behavior between Clang and DXC. Some examples include: @@ -71,18 +84,23 @@ behavior between Clang and DXC. Some examples include: uint U; int I; float X, Y, Z; - double3 A, B; + double3 R, G; } - void twoParams(int, int); - void twoParams(float, float); + void takesSingleDouble(double); + void takesSingleDouble(vector); + + void scalarOrVector(double); + void scalarOrVector(vector); export void call() { - halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads - // Clang: Resolves to halfOrInt16(uint16_t). - halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). half H; + halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). + #ifndef IGNORE_ERRORS + halfOrInt16(U); // All: Fails with call ambiguous between int16_t and uint16_t + // overloads + // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t. H = asfloat16(I); // DXC: Fails to resolve overload for int. // Clang: Resolves to asfloat16(int16_t). @@ -94,21 +112,28 @@ behavior between Clang and DXC. Some examples include: takesDoubles(X, Y, Z); // Works on all compilers #ifndef IGNORE_ERRORS - fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. + fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to + // double. // Clang: Resolves to fma(double,double,double). - #endif - double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. + double D = dot(R, G); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. // FXC: Expands to compute double dot product with fmul/fadd - // Clang: Resolves to dot(float3, float3), emits conversion warnings. + // Clang: Fails to resolve as ambiguous against + // dot(half, half) or dot(float, float) + #endif #ifndef IGNORE_ERRORS tan(B); // DXC: resolves to tan(float). 
// Clang: Fails to resolve, ambiguous between integer types. - twoParams(I, X); // DXC: resolves twoParams(int, int). - // Clang: Fails to resolve ambiguous conversions. #endif + + double D; + takesSingleDouble(D); // All: Fails to resolve ambiguous conversions. + takesSingleDouble(R); // All: Fails to resolve ambiguous conversions. + + scalarOrVector(D); // All: Resolves to scalarOrVector(double). + scalarOrVector(R); // All: Fails to resolve ambiguous conversions. } .. note:: @@ -119,3 +144,75 @@ behavior between Clang and DXC. Some examples include: diagnostic notifying the user of the conversion rather than silently altering precision relative to the other overloads (as FXC does) or generating code that will fail validation (as DXC does). + +.. _multi_argument_overloads: + +Multi-Argument Overloads +------------------------ + +In addition to the differences in single-element conversions, Clang and DXC +differ dramatically in multi-argument overload resolution. C++ multi-argument +overload resolution behavior (or something very similar) is required to +implement +`non-member operator overloading `_. + +Clang adopts the C++ inspired language from the +`draft HLSL specification `_, +where an overload ``f1`` is a better candidate than ``f2`` if for all arguments the +conversion sequences is not worse than the corresponding conversion sequence and +for at least one argument it is better. + +.. code-block:: c++ + + cbuffer CB { + int I; + float X; + float4 V; + } + + void twoParams(int, int); + void twoParams(float, float); + void threeParams(float, float, float); + void threeParams(float4, float4, float4); + + export void call() { + twoParams(I, X); // DXC: resolves twoParams(int, int). + // Clang: Fails to resolve ambiguous conversions. + + threeParams(X, V, V); // DXC: resolves threeParams(float4, float4, float4). + // Clang: Fails to resolve ambiguous conversions. 
+ }
+
+For the examples above, ``twoParams`` called with mixed parameters produces
+implicit conversion sequences that are { ExactMatch, FloatingIntegral } and {
+FloatingIntegral, ExactMatch }. In both cases an argument has a worse conversion
+in the other sequence, so the overload is ambiguous.
+
+In the ``threeParams`` example the sequences are { ExactMatch, VectorTruncation,
+VectorTruncation } or { VectorSplat, ExactMatch, ExactMatch }, again in both
+cases at least one parameter has a worse conversion in the other sequence, so
+the overload is ambiguous.
+
+.. note::
+
+  The behavior of DXC described below is undocumented, so this is gleaned from
+  observation and a bit of reading the source.
+
+DXC's approach for determining the best overload produces an integer score value
+for each implicit conversion sequence for each argument expression. Scores for
+casts are based on a bitmask construction that is complicated to reverse
+engineer. It seems that:
+
+* Exact match is 0
+* Dimension increase is 1
+* Promotion is 2
+* Integral -> Float conversion is 4
+* Float -> Integral conversion is 8
+* Cast is 16
+
+The masks are or'd against each other to produce a score for the cast.
+
+The scores of each conversion sequence are then summed to generate a score for
+the overload candidate. The overload candidate with the lowest score is the best
+candidate. If more than one overload is matched for the lowest score, the call
+is ambiguous.
diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
index 35ae1547939fdd..b8778f6027894c 100644
--- a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
@@ -21,17 +21,6 @@
 using namespace clang;
 using namespace clang::interp;
 
-/// Unevaluated builtins don't get their arguments put on the stack
-/// automatically. They instead operate on the AST of their Call
-/// Expression. 
-/// Similar information is available via ASTContext::BuiltinInfo, -/// but that is not correct for our use cases. -static bool isUnevaluatedBuiltin(unsigned BuiltinID) { - return BuiltinID == Builtin::BI__builtin_classify_type || - BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size || - BuiltinID == Builtin::BI__builtin_constant_p; -} - Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Manually created functions that haven't been assigned proper @@ -147,14 +136,11 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Create a handle over the emitted code. Function *Func = P.getFunction(FuncDecl); if (!Func) { - bool IsUnevaluatedBuiltin = false; - if (unsigned BI = FuncDecl->getBuiltinID()) - IsUnevaluatedBuiltin = isUnevaluatedBuiltin(BI); - + unsigned BuiltinID = FuncDecl->getBuiltinID(); Func = P.createFunction(FuncDecl, ParamOffset, std::move(ParamTypes), std::move(ParamDescriptors), std::move(ParamOffsets), - HasThisPointer, HasRVO, IsUnevaluatedBuiltin); + HasThisPointer, HasRVO, BuiltinID); } assert(Func); diff --git a/clang/lib/AST/ByteCode/Function.cpp b/clang/lib/AST/ByteCode/Function.cpp index e3fab3f6720b41..25da6ae1bc7b61 100644 --- a/clang/lib/AST/ByteCode/Function.cpp +++ b/clang/lib/AST/ByteCode/Function.cpp @@ -20,11 +20,10 @@ Function::Function(Program &P, FunctionDeclTy Source, unsigned ArgSize, llvm::SmallVectorImpl &&ParamTypes, llvm::DenseMap &&Params, llvm::SmallVectorImpl &&ParamOffsets, - bool HasThisPointer, bool HasRVO, bool UnevaluatedBuiltin) + bool HasThisPointer, bool HasRVO, unsigned BuiltinID) : P(P), Source(Source), ArgSize(ArgSize), ParamTypes(std::move(ParamTypes)), Params(std::move(Params)), ParamOffsets(std::move(ParamOffsets)), - HasThisPointer(HasThisPointer), HasRVO(HasRVO), - IsUnevaluatedBuiltin(UnevaluatedBuiltin) { + HasThisPointer(HasThisPointer), HasRVO(HasRVO), BuiltinID(BuiltinID) { if (const auto *F = Source.dyn_cast()) Variadic = F->isVariadic(); } @@ -53,3 
+52,18 @@ bool Function::isVirtual() const { return M->isVirtual(); return false; } + +/// Unevaluated builtins don't get their arguments put on the stack +/// automatically. They instead operate on the AST of their Call +/// Expression. +/// Similar information is available via ASTContext::BuiltinInfo, +/// but that is not correct for our use cases. +static bool isUnevaluatedBuiltin(unsigned BuiltinID) { + return BuiltinID == Builtin::BI__builtin_classify_type || + BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size || + BuiltinID == Builtin::BI__builtin_constant_p; +} + +bool Function::isUnevaluatedBuiltin() const { + return ::isUnevaluatedBuiltin(BuiltinID); +} diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index f254db20d4f594..b21fa8497130ea 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -193,15 +193,11 @@ class Function final { bool isVariadic() const { return Variadic; } - unsigned getBuiltinID() const { - return Source.get()->getBuiltinID(); - } + unsigned getBuiltinID() const { return BuiltinID; } - bool isBuiltin() const { - return Source.get()->getBuiltinID() != 0; - } + bool isBuiltin() const { return getBuiltinID() != 0; } - bool isUnevaluatedBuiltin() const { return IsUnevaluatedBuiltin; } + bool isUnevaluatedBuiltin() const; unsigned getNumParams() const { return ParamTypes.size(); } @@ -232,7 +228,7 @@ class Function final { llvm::SmallVectorImpl &&ParamTypes, llvm::DenseMap &&Params, llvm::SmallVectorImpl &&ParamOffsets, bool HasThisPointer, - bool HasRVO, bool UnevaluatedBuiltin); + bool HasRVO, unsigned BuiltinID); /// Sets the code of a function. void setCode(unsigned NewFrameSize, std::vector &&NewCode, @@ -289,7 +285,7 @@ class Function final { bool HasBody = false; bool Defined = false; bool Variadic = false; - bool IsUnevaluatedBuiltin = false; + unsigned BuiltinID = 0; public: /// Dumps the disassembled bytecode to \c llvm::errs(). 
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 5b518bf6c859e8..246b29d308bfaf 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2682,6 +2682,7 @@ void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) { break; } case tok::at: + case tok::colon: nextToken(); if (FormatTok->is(tok::l_brace)) { nextToken(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 5aa5d93c1cb067..497b911f4efbba 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3286,6 +3286,15 @@ TEST_F(TokenAnnotatorTest, BlockLBrace) { EXPECT_BRACE_KIND(Tokens[4], BK_Block); EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_BlockLBrace); EXPECT_BRACE_KIND(Tokens[5], BK_Block); + + Tokens = annotate("[foo bar:{{0, 1}} baz:baz];", + getLLVMStyle(FormatStyle::LK_ObjC)); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_Unknown); // Not TT_BlockLBrace. + EXPECT_BRACE_KIND(Tokens[4], BK_Unknown); // Not BK_Block. + EXPECT_BRACE_KIND(Tokens[5], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_Unknown); // Not BK_Block. + EXPECT_BRACE_KIND(Tokens[10], BK_Unknown); // Not BK_Block. 
} TEST_F(TokenAnnotatorTest, SwitchExpression) { diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 70fa32d621e2f1..f76d44f5479d32 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -478,6 +478,20 @@ void Fortran::lower::createGlobalInitialization( builder.restoreInsertionPoint(insertPt); } +static unsigned getAllocatorIdx(cuf::DataAttributeAttr dataAttr) { + if (dataAttr) { + if (dataAttr.getValue() == cuf::DataAttribute::Pinned) + return kPinnedAllocatorPos; + if (dataAttr.getValue() == cuf::DataAttribute::Device) + return kDeviceAllocatorPos; + if (dataAttr.getValue() == cuf::DataAttribute::Managed) + return kManagedAllocatorPos; + if (dataAttr.getValue() == cuf::DataAttribute::Unified) + return kUnifiedAllocatorPos; + } + return kDefaultAllocator; +} + /// Create the global op and its init if it has one static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, @@ -540,8 +554,10 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, // Create unallocated/disassociated descriptor if no explicit init Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &b) { - mlir::Value box = - fir::factory::createUnallocatedBox(b, loc, symTy, std::nullopt); + mlir::Value box = fir::factory::createUnallocatedBox( + b, loc, symTy, + /*nonDeferredParams=*/std::nullopt, + /*typeSourceBox=*/{}, getAllocatorIdx(dataAttr)); b.create(loc, box); }); } diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index fb72f88fe415ca..6479425c58d8be 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -2,6 +2,21 @@ ! Test lowering of CUDA allocatable allocate/deallocate statements. 
+module globals + real, device, allocatable :: a_device(:) + real, managed, allocatable :: a_managed(:) + real, pinned, allocatable :: a_pinned(:) +end module + +! CHECK-LABEL: fir.global @_QMglobalsEa_device {data_attr = #cuf.cuda} : !fir.box>> +! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 2 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + +! CHECK-LABEL: fir.global @_QMglobalsEa_managed {data_attr = #cuf.cuda} : !fir.box>> +! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 3 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + +! CHECK-LABEL: fir.global @_QMglobalsEa_pinned {data_attr = #cuf.cuda} : !fir.box>> +! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 1 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + subroutine sub1() real, allocatable, device :: a(:) allocate(a(10)) diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 1742e1f7b0ef33..a4ae3e1ff7d9c6 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -1321,13 +1321,13 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtof", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtod", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtold", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtol", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoll", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtof_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtod_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtold_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtol_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoll_l", 
RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoul_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoull_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"malloc", RetValSpec, [ArgSpec]>, FunctionSpec<"calloc", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index 1a4049e4d34f2d..592f6261a6de3f 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -66,8 +66,8 @@ using ::max_align_t _LIBCPP_USING_IF_EXISTS; _LIBCPP_END_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -namespace std // purposefully not versioned -{ +namespace std { // purposefully not versioned + enum class byte : unsigned char {}; _LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { @@ -127,7 +127,6 @@ template ::value, int> = 0> } } // namespace std - -#endif +#endif // _LIBCPP_STD_VER >= 17 #endif // _LIBCPP_CSTDDEF diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 13d0dce34d97e3..f193b5d95f49f5 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -245,8 +245,15 @@ module std_stdexcept [system] { header "stdexcept" export * } -module std_stop_token { +module std_stop_token [system] { header "stop_token" + private header "__stop_token/atomic_unique_lock.h" + private header "__stop_token/intrusive_list_view.h" + private header "__stop_token/intrusive_shared_ptr.h" + private header "__stop_token/stop_callback.h" + private header "__stop_token/stop_source.h" + private header "__stop_token/stop_state.h" + private header "__stop_token/stop_token.h" export * } module std_streambuf [system] { @@ -1592,41 +1599,25 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header 
"__numeric/transform_reduce.h" } -module std_private_pstl_backend [system] { +module std_private_pstl [system] { header "__pstl/backend.h" - export * -} -module std_private_pstl_backend_fwd [system] { header "__pstl/backend_fwd.h" - export * -} -module std_private_pstl_backends_default [system] { header "__pstl/backends/default.h" - export * -} -module std_private_pstl_backends_libdispatch [system] { header "__pstl/backends/libdispatch.h" - export * -} -module std_private_pstl_backends_serial [system] { header "__pstl/backends/serial.h" - export * -} -module std_private_pstl_backends_std_thread [system] { header "__pstl/backends/std_thread.h" - export * + header "__pstl/cpu_algos/any_of.h" + header "__pstl/cpu_algos/cpu_traits.h" + header "__pstl/cpu_algos/fill.h" + header "__pstl/cpu_algos/find_if.h" + header "__pstl/cpu_algos/for_each.h" + header "__pstl/cpu_algos/merge.h" + header "__pstl/cpu_algos/stable_sort.h" + header "__pstl/cpu_algos/transform.h" + header "__pstl/cpu_algos/transform_reduce.h" + header "__pstl/dispatch.h" + header "__pstl/handle_exception.h" } -module std_private_pstl_cpu_algos_any_of [system] { header "__pstl/cpu_algos/any_of.h" } -module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } -module std_private_pstl_cpu_algos_fill [system] { header "__pstl/cpu_algos/fill.h" } -module std_private_pstl_cpu_algos_find_if [system] { header "__pstl/cpu_algos/find_if.h" } -module std_private_pstl_cpu_algos_for_each [system] { header "__pstl/cpu_algos/for_each.h" } -module std_private_pstl_cpu_algos_merge [system] { header "__pstl/cpu_algos/merge.h" } -module std_private_pstl_cpu_algos_stable_sort [system] { header "__pstl/cpu_algos/stable_sort.h" } -module std_private_pstl_cpu_algos_transform [system] { header "__pstl/cpu_algos/transform.h" } -module std_private_pstl_cpu_algos_transform_reduce [system] { header "__pstl/cpu_algos/transform_reduce.h" } -module std_private_pstl_dispatch [system] { header 
"__pstl/dispatch.h" } -module std_private_pstl_handle_exception [system] { header "__pstl/handle_exception.h" } module std_private_queue_fwd [system] { header "__fwd/queue.h" } @@ -1781,23 +1772,6 @@ module std_private_span_span_fwd [system] { header "__fwd/span.h" } module std_private_stack_fwd [system] { header "__fwd/stack.h" } -module std_private_stop_token_atomic_unique_lock [system] { header "__stop_token/atomic_unique_lock.h" } -module std_private_stop_token_intrusive_list_view [system] { header "__stop_token/intrusive_list_view.h" } -module std_private_stop_token_intrusive_shared_ptr [system] { header "__stop_token/intrusive_shared_ptr.h" } -module std_private_stop_token_stop_callback [system] { header "__stop_token/stop_callback.h" } -module std_private_stop_token_stop_source [system] { - header "__stop_token/stop_source.h" - export * -} -module std_private_stop_token_stop_state [system] { - header "__stop_token/stop_state.h" - export * -} -module std_private_stop_token_stop_token [system] { - header "__stop_token/stop_token.h" - export * -} - module std_private_string_char_traits [system] { header "__string/char_traits.h" export * diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp index 2a9b828f4389ce..44d51921ac74ad 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp @@ -5,12 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads - -// XFAIL: availability-synchronization_library-missing +// UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/atomic_unique_lock.h> #include 
diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp index 85cd9786258955..d8cd2fb68e132e 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp @@ -8,6 +8,7 @@ // // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/intrusive_list_view.h> #include diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp index 47440015f2c50c..99d4226662a0b7 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp @@ -8,6 +8,7 @@ // // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/intrusive_shared_ptr.h> #include diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index ae7bd3d2311f96..172824dc78a6bc 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -9,6 +9,7 @@ #ifndef LLDB_CORE_SOURCEMANAGER_H #define LLDB_CORE_SOURCEMANAGER_H +#include "lldb/Utility/Checksum.h" #include "lldb/Utility/FileSpec.h" #include "lldb/lldb-defines.h" #include "lldb/lldb-forward.h" @@ -71,6 +72,8 @@ class SourceManager { llvm::sys::TimePoint<> GetTimestamp() const { return m_mod_time; } + const Checksum &GetChecksum() const { return m_checksum; } + protected: /// Set file and update modification time. void SetSupportFile(lldb::SupportFileSP support_file_sp); @@ -81,6 +84,9 @@ class SourceManager { /// different from the original support file passed to the constructor. lldb::SupportFileSP m_support_file_sp; + /// Keep track of the on-disk checksum. 
+ Checksum m_checksum; + // Keep the modification time that this file data is valid for llvm::sys::TimePoint<> m_mod_time; diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index c427bb91f4643a..f6e59ce731a573 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -447,13 +447,14 @@ void SourceManager::FindLinesMatchingRegex(SupportFileSP support_file_sp, SourceManager::File::File(SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp) - : m_support_file_sp(std::make_shared()), m_mod_time(), - m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { + : m_support_file_sp(std::make_shared()), m_checksum(), + m_mod_time(), m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { CommonInitializer(support_file_sp, {}); } SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) - : m_support_file_sp(std::make_shared()), m_mod_time(), + : m_support_file_sp(std::make_shared()), m_checksum(), + m_mod_time(), m_debugger_wp(target_sp ? target_sp->GetDebugger().shared_from_this() : DebuggerSP()), m_target_wp(target_sp) { @@ -532,9 +533,11 @@ void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, } // If the file exists, read in the data. 
- if (m_mod_time != llvm::sys::TimePoint<>()) + if (m_mod_time != llvm::sys::TimePoint<>()) { m_data_sp = FileSystem::Instance().CreateDataBuffer( m_support_file_sp->GetSpecOnly()); + m_checksum = llvm::MD5::hash(m_data_sp->GetData()); + } } void SourceManager::File::SetSupportFile(lldb::SupportFileSP support_file_sp) { @@ -835,14 +838,24 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile( return {}; } +static std::string toString(const Checksum &checksum) { + if (!checksum) + return ""; + return std::string(llvm::formatv("{0}", checksum.digest())); +} + void SourceManager::SourceFileCache::Dump(Stream &stream) const { - stream << "Modification time Lines Path\n"; - stream << "------------------- -------- --------------------------------\n"; + // clang-format off + stream << "Modification time MD5 Checksum (on-disk) MD5 Checksum (line table) Lines Path\n"; + stream << "------------------- -------------------------------- -------------------------------- -------- --------------------------------\n"; + // clang-format on for (auto &entry : m_file_cache) { if (!entry.second) continue; FileSP file = entry.second; - stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,8:d} {2}\n", file->GetTimestamp(), + stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,32} {2,32} {3,8:d} {4}\n", + file->GetTimestamp(), toString(file->GetChecksum()), + toString(file->GetSupportFile()->GetChecksum()), file->GetNumLines(), entry.first.GetPath()); } } diff --git a/llvm/cmake/platforms/WinMsvc.cmake b/llvm/cmake/platforms/WinMsvc.cmake index e5d1ba8ec4a7c2..40d47f12c53ab7 100644 --- a/llvm/cmake/platforms/WinMsvc.cmake +++ b/llvm/cmake/platforms/WinMsvc.cmake @@ -95,6 +95,7 @@ list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES LLVM_WINSYSROOT MSVC_VER WINSDK_VER + msvc_lib_symlinks_dir winsdk_lib_symlinks_dir winsdk_vfs_overlay_path ) @@ -156,6 +157,24 @@ function(generate_winsdk_lib_symlinks winsdk_um_lib_dir output_dir) endforeach() endfunction() +function(generate_msvc_lib_symlinks 
msvc_lib_dir output_dir) + execute_process(COMMAND "${CMAKE_COMMAND}" -E make_directory "${output_dir}") + file(GLOB libraries RELATIVE "${msvc_lib_dir}" "${msvc_lib_dir}/*.lib") + foreach(library ${libraries}) + get_filename_component(name_wle "${library}" NAME_WLE) + get_filename_component(ext "${library}" LAST_EXT) + string(TOLOWER "${ext}" lowercase_ext) + string(TOUPPER "${name_wle}" all_uppercase_symlink_name_wle) + set(uppercase_symlink_name "${all_uppercase_symlink_name_wle}${lowercase_ext}") + if(NOT library STREQUAL uppercase_symlink_name) + execute_process(COMMAND "${CMAKE_COMMAND}" + -E create_symlink + "${msvc_lib_dir}/${library}" + "${output_dir}/${uppercase_symlink_name}") + endif() + endforeach() +endfunction() + function(get_highest_version the_dir the_ver) file(GLOB entries LIST_DIRECTORIES true RELATIVE "${the_dir}" "${the_dir}/[0-9.]*") foreach(entry ${entries}) @@ -297,6 +316,12 @@ if(case_sensitive_filesystem) endif() list(APPEND LINK_FLAGS -libpath:"${winsdk_lib_symlinks_dir}") + if(NOT msvc_lib_symlinks_dir) + set(msvc_lib_symlinks_dir "${CMAKE_BINARY_DIR}/msvc_lib_symlinks") + generate_msvc_lib_symlinks("${MSVC_LIB}/${WINSDK_ARCH}" "${msvc_lib_symlinks_dir}") + endif() + list(APPEND LINK_FLAGS + -libpath:"${msvc_lib_symlinks_dir}") endif() string(REPLACE ";" " " LINK_FLAGS "${LINK_FLAGS}") diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 73047b4f119807..be3befc14ad72d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -438,8 +438,9 @@ void DwarfCompileUnit::addLocationAttribute( Asm->getObjFileLowering().getIndirectSymViaRWPI(Sym)); // Base register Register BaseReg = Asm->getObjFileLowering().getStaticBase(); - BaseReg = Asm->TM.getMCRegisterInfo()->getDwarfRegNum(BaseReg, false); - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + BaseReg); + unsigned DwarfBaseReg = + 
Asm->TM.getMCRegisterInfo()->getDwarfRegNum(BaseReg, false); + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DwarfBaseReg); // Offset from base register addSInt(*Loc, dwarf::DW_FORM_sdata, 0); // Operation diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index b9732e816ea7e6..39a705599f90cc 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,30 +351,17 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { - BasicBlock *BB = &*BBI; - ++BBI; - - BasicBlock::iterator Next; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Instruction &Inst = *I; - Next = std::next(I); - - if (processAtomicInstr(&Inst)) { - MadeChange = true; - - // Detect control flow change and resume iteration from the original - // block to inspect any newly inserted blocks. This allows incremental - // legalizaton of atomicrmw and cmpxchg. - if (BB != Next->getParent()) { - BBI = BB->getIterator(); - BBE = F.end(); - break; - } - } - } + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. 
+ for (Instruction &I : instructions(F)) + if (I.isAtomic() && !isa(&I)) + AtomicInsts.push_back(&I); + + for (auto *I : AtomicInsts) { + if (processAtomicInstr(I)) + MadeChange = true; } return MadeChange; diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 675d88d6d38cd9..5140f5951d6d3f 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include using namespace llvm; @@ -437,69 +438,33 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, default: llvm_unreachable("Impossible reduction kind"); case Intrinsic::vp_reduce_add: - Reduction = Builder.CreateAddReduce(RedOp); - Reduction = Builder.CreateAdd(Reduction, Start); - break; case Intrinsic::vp_reduce_mul: - Reduction = Builder.CreateMulReduce(RedOp); - Reduction = Builder.CreateMul(Reduction, Start); - break; case Intrinsic::vp_reduce_and: - Reduction = Builder.CreateAndReduce(RedOp); - Reduction = Builder.CreateAnd(Reduction, Start); - break; case Intrinsic::vp_reduce_or: - Reduction = Builder.CreateOrReduce(RedOp); - Reduction = Builder.CreateOr(Reduction, Start); - break; - case Intrinsic::vp_reduce_xor: - Reduction = Builder.CreateXorReduce(RedOp); - Reduction = Builder.CreateXor(Reduction, Start); - break; - case Intrinsic::vp_reduce_smax: - Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true); + case Intrinsic::vp_reduce_xor: { + Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID(); + unsigned Opc = getArithmeticReductionInstruction(RedID); + assert(Instruction::isBinaryOp(Opc)); + Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp); Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::smax, Reduction, Start); + Builder.CreateBinOp((Instruction::BinaryOps)Opc, Reduction, Start); break; + 
} + case Intrinsic::vp_reduce_smax: case Intrinsic::vp_reduce_smin: - Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start); - break; case Intrinsic::vp_reduce_umax: - Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start); - break; case Intrinsic::vp_reduce_umin: - Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start); - break; case Intrinsic::vp_reduce_fmax: - Reduction = Builder.CreateFPMaxReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start); - break; case Intrinsic::vp_reduce_fmin: - Reduction = Builder.CreateFPMinReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start); - break; case Intrinsic::vp_reduce_fmaximum: - Reduction = Builder.CreateFPMaximumReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::maximum, Reduction, Start); - break; - case Intrinsic::vp_reduce_fminimum: - Reduction = Builder.CreateFPMinimumReduce(RedOp); + case Intrinsic::vp_reduce_fminimum: { + Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID(); + Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RedID); + Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp); transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::minimum, Reduction, Start); + Reduction = Builder.CreateBinaryIntrinsic(ScalarID, Reduction, Start); break; + } case Intrinsic::vp_reduce_fadd: Reduction = Builder.CreateFAddReduce(Start, RedOp); break; diff --git a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp 
b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp index e2b5ce49ba2ec1..cf9ed7dbff1536 100644 --- a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +++ b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp @@ -60,6 +60,10 @@ class PerfJITEventListener : public JITEventListener { public: PerfJITEventListener(); ~PerfJITEventListener() { + // Lock a mutex to correctly synchronize with prior calls to + // `notifyObjectLoaded` and `notifyFreeingObject` that happened on other + // threads to prevent tsan from complaining. + std::lock_guard Guard(Mutex); if (MarkerAddr) CloseMarker(); } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 34c0fad45fc499..37add682b150e7 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -6947,14 +6947,10 @@ static void ExpandCryptoAEK(const AArch64::ArchInfo &ArchInfo, } } -static SMLoc incrementLoc(SMLoc L, int Offset) { - return SMLoc::getFromPointer(L.getPointer() + Offset); -} - /// parseDirectiveArch /// ::= .arch token bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { - SMLoc CurLoc = getLoc(); + SMLoc ArchLoc = getLoc(); StringRef Arch, ExtensionString; std::tie(Arch, ExtensionString) = @@ -6962,7 +6958,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { const AArch64::ArchInfo *ArchInfo = AArch64::parseArch(Arch); if (!ArchInfo) - return Error(CurLoc, "unknown arch name"); + return Error(ArchLoc, "unknown arch name"); if (parseToken(AsmToken::EndOfStatement)) return true; @@ -6982,30 +6978,27 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { ExtensionString.split(RequestedExtensions, '+'); ExpandCryptoAEK(*ArchInfo, RequestedExtensions); - CurLoc = incrementLoc(CurLoc, Arch.size()); + FeatureBitset Features = STI.getFeatureBits(); + setAvailableFeatures(ComputeAvailableFeatures(Features)); for (auto Name : 
RequestedExtensions) { - // Advance source location past '+'. - CurLoc = incrementLoc(CurLoc, 1); - bool EnableFeature = !Name.consume_front_insensitive("no"); - auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { - return Extension.Name == Name; - }); - - if (It == std::end(ExtensionMap)) - Error(CurLoc, "unsupported architectural extension: " + Name); + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; - if (EnableFeature) - STI.SetFeatureBitsTransitively(It->Features); - else - STI.ClearFeatureBitsTransitively(It->Features); + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); - CurLoc = incrementLoc(CurLoc, Name.size()); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); + break; + } } - FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); - setAvailableFeatures(Features); return false; } @@ -7025,21 +7018,28 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { Name = Name.substr(2); } - auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { - return Extension.Name == Name; - }); + MCSubtargetInfo &STI = copySTI(); + FeatureBitset Features = STI.getFeatureBits(); + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; + + if (Extension.Features.none()) + return Error(ExtLoc, "unsupported architectural extension: " + Name); + + FeatureBitset ToggleFeatures = + EnableFeature + ? 
STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); + return false; + } - if (It == std::end(ExtensionMap)) - return Error(ExtLoc, "unsupported architectural extension: " + Name); + return Error(ExtLoc, "unknown architectural extension: " + Name); +} - MCSubtargetInfo &STI = copySTI(); - if (EnableFeature) - STI.SetFeatureBitsTransitively(It->Features); - else - STI.ClearFeatureBitsTransitively(It->Features); - FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); - setAvailableFeatures(Features); - return false; +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); } /// parseDirectiveCPU @@ -7075,22 +7075,30 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { bool EnableFeature = !Name.consume_front_insensitive("no"); - auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { - return Extension.Name == Name; - }); + bool FoundExtension = false; + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; - if (It == std::end(ExtensionMap)) - Error(CurLoc, "unsupported architectural extension: " + Name); + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); - if (EnableFeature) - STI.SetFeatureBitsTransitively(It->Features); - else - STI.ClearFeatureBitsTransitively(It->Features); + FeatureBitset Features = STI.getFeatureBits(); + FeatureBitset ToggleFeatures = + EnableFeature + ? 
STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); + FoundExtension = true; + + break; + } + + if (!FoundExtension) + Error(CurLoc, "unsupported architectural extension"); CurLoc = incrementLoc(CurLoc, Name.size()); } - FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); - setAvailableFeatures(Features); return false; } diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index f7bc6f958470b9..f07ae4c9baf1c6 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -28,6 +28,7 @@ FunctionPass *createBPFISelDag(BPFTargetMachine &TM); FunctionPass *createBPFMISimplifyPatchablePass(); FunctionPass *createBPFMIPeepholePass(); FunctionPass *createBPFMIPreEmitPeepholePass(); +FunctionPass *createBPFMIPreEmitCheckingPass(); InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, const BPFSubtarget &, @@ -36,6 +37,7 @@ InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, void initializeBPFCheckAndAdjustIRPass(PassRegistry&); void initializeBPFDAGToDAGISelLegacyPass(PassRegistry &); void initializeBPFMIPeepholePass(PassRegistry &); +void initializeBPFMIPreEmitCheckingPass(PassRegistry &); void initializeBPFMIPreEmitPeepholePass(PassRegistry &); void initializeBPFMISimplifyPatchablePass(PassRegistry &); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 4baeeb017699d6..6c750af5c2fd92 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -786,45 +786,13 @@ let Predicates = [BPFNoALU32] in { def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>; } -// Atomic XADD for BPFNoALU32 -class XADD - : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let 
Inst{7-4} = BPF_ADD.Value; - let BPFClass = BPF_STX; -} - // Atomic add, and, or, xor -class ATOMIC_NOFETCH - : TYPE_LD_ST + : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = Opc.Value; - let BPFClass = BPF_STX; -} - -class ATOMIC32_NOFETCH - : TYPE_LD_ST { bits<4> dst; bits<20> addr; @@ -838,16 +806,23 @@ class ATOMIC32_NOFETCH let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { - def XADDW32 : ATOMIC32_NOFETCH; - def XANDW32 : ATOMIC32_NOFETCH; - def XORW32 : ATOMIC32_NOFETCH; - def XXORW32 : ATOMIC32_NOFETCH; + def XADDW32 : ATOMIC_NOFETCH; + def XANDW32 : ATOMIC_NOFETCH; + def XORW32 : ATOMIC_NOFETCH; + def XXORW32 : ATOMIC_NOFETCH; } + def XADDW : ATOMIC_NOFETCH; + def XADDD : ATOMIC_NOFETCH; + def XANDD : ATOMIC_NOFETCH; + def XORD : ATOMIC_NOFETCH; + def XXORD : ATOMIC_NOFETCH; +} - def XADDD : ATOMIC_NOFETCH; - def XANDD : ATOMIC_NOFETCH; - def XORD : ATOMIC_NOFETCH; - def XXORD : ATOMIC_NOFETCH; +let Predicates = [BPFNoALU32] in { + def : Pat<(atomic_load_add_i32 ADDRri:$addr, GPR:$val), + (XADDW ADDRri:$addr, GPR:$val)>; + def : Pat<(atomic_load_add_i64 ADDRri:$addr, GPR:$val), + (XADDD ADDRri:$addr, GPR:$val)>; } // Atomic Fetch-and- operations @@ -887,13 +862,6 @@ class XFALU32; - def XFADDW : XFALU64; - } -} - let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { def XFADDW32 : XFALU32; @@ -902,7 +870,9 @@ let Constraints = "$dst = $val" in { def XFXORW32 : XFALU32; } - def XFADDD : XFALU64; + let Predicates = [BPFHasALU32] in { + def XFADDD : XFALU64; + } def XFANDD : XFALU64; def XFORD : XFALU64; def XFXORD : XFALU64; diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp new file mode 100644 index 00000000000000..24224f6c1e9e66 --- /dev/null +++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp @@ 
-0,0 +1,181 @@ +//===-------------- BPFMIChecking.cpp - MI Checking Legality -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass performs checking to signal errors for certain illegal usages at +// MachineInstruction layer. Specially, the result of XADD{32,64} insn should +// not be used. The pass is done at the PreEmit pass right before the +// machine code is emitted at which point the register liveness information +// is still available. +// +//===----------------------------------------------------------------------===// + +#include "BPF.h" +#include "BPFInstrInfo.h" +#include "BPFTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "bpf-mi-checking" + +namespace { + +struct BPFMIPreEmitChecking : public MachineFunctionPass { + + static char ID; + MachineFunction *MF; + const TargetRegisterInfo *TRI; + + BPFMIPreEmitChecking() : MachineFunctionPass(ID) { + initializeBPFMIPreEmitCheckingPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + void processAtomicInsts(); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + if (!skipFunction(MF.getFunction())) { + initialize(MF); + processAtomicInsts(); + } + return false; + } +}; + +// Initialize class variables. 
+void BPFMIPreEmitChecking::initialize(MachineFunction &MFParm) { + MF = &MFParm; + TRI = MF->getSubtarget().getRegisterInfo(); + LLVM_DEBUG(dbgs() << "*** BPF PreEmit checking pass ***\n\n"); +} + +// Make sure all Defs of XADD are dead, meaning any result of XADD insn is not +// used. +// +// NOTE: BPF backend hasn't enabled sub-register liveness track, so when the +// source and destination operands of XADD are GPR32, there is no sub-register +// dead info. If we rely on the generic MachineInstr::allDefsAreDead, then we +// will raise false alarm on GPR32 Def. +// +// To support GPR32 Def, ideally we could just enable sub-registr liveness track +// on BPF backend, then allDefsAreDead could work on GPR32 Def. This requires +// implementing TargetSubtargetInfo::enableSubRegLiveness on BPF. +// +// However, sub-register liveness tracking module inside LLVM is actually +// designed for the situation where one register could be split into more than +// one sub-registers for which case each sub-register could have their own +// liveness and kill one of them doesn't kill others. So, tracking liveness for +// each make sense. +// +// For BPF, each 64-bit register could only have one 32-bit sub-register. This +// is exactly the case which LLVM think brings no benefits for doing +// sub-register tracking, because the live range of sub-register must always +// equal to its parent register, therefore liveness tracking is disabled even +// the back-end has implemented enableSubRegLiveness. The detailed information +// is at r232695: +// +// Author: Matthias Braun +// Date: Thu Mar 19 00:21:58 2015 +0000 +// Do not track subregister liveness when it brings no benefits +// +// Hence, for BPF, we enhance MachineInstr::allDefsAreDead. Given the solo +// sub-register always has the same liveness as its parent register, LLVM is +// already attaching a implicit 64-bit register Def whenever the there is +// a sub-register Def. The liveness of the implicit 64-bit Def is available. 
+// For example, for "lock *(u32 *)(r0 + 4) += w9", the MachineOperand info could +// be: +// +// $w9 = XADDW32 killed $r0, 4, $w9(tied-def 0), +// implicit killed $r9, implicit-def dead $r9 +// +// Even though w9 is not marked as Dead, the parent register r9 is marked as +// Dead correctly, and it is safe to use such information or our purpose. +static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) { + const MCRegisterClass *GPR64RegClass = + &BPFMCRegisterClasses[BPF::GPRRegClassID]; + std::vector GPR32LiveDefs; + std::vector GPR64DeadDefs; + + for (const MachineOperand &MO : MI.operands()) { + bool RegIsGPR64; + + if (!MO.isReg() || MO.isUse()) + continue; + + RegIsGPR64 = GPR64RegClass->contains(MO.getReg()); + if (!MO.isDead()) { + // It is a GPR64 live Def, we are sure it is live. */ + if (RegIsGPR64) + return true; + // It is a GPR32 live Def, we are unsure whether it is really dead due to + // no sub-register liveness tracking. Push it to vector for deferred + // check. + GPR32LiveDefs.push_back(MO.getReg()); + continue; + } + + // Record any GPR64 dead Def as some unmarked GPR32 could be alias of its + // low 32-bit. + if (RegIsGPR64) + GPR64DeadDefs.push_back(MO.getReg()); + } + + // No GPR32 live Def, safe to return false. + if (GPR32LiveDefs.empty()) + return false; + + // No GPR64 dead Def, so all those GPR32 live Def can't have alias, therefore + // must be truely live, safe to return true. + if (GPR64DeadDefs.empty()) + return true; + + // Otherwise, return true if any aliased SuperReg of GPR32 is not dead. 
+ for (auto I : GPR32LiveDefs) + for (MCPhysReg SR : TRI->superregs(I)) + if (!llvm::is_contained(GPR64DeadDefs, SR)) + return true; + + return false; +} + +void BPFMIPreEmitChecking::processAtomicInsts() { + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD) + continue; + + LLVM_DEBUG(MI.dump()); + if (hasLiveDefs(MI, TRI)) { + DebugLoc Empty; + const DebugLoc &DL = MI.getDebugLoc(); + const Function &F = MF->getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, "Invalid usage of the XADD return value", DL}); + } + } + } +} + +} // namespace + +INITIALIZE_PASS(BPFMIPreEmitChecking, "bpf-mi-pemit-checking", + "BPF PreEmit Checking", false, false) + +char BPFMIPreEmitChecking::ID = 0; +FunctionPass *llvm::createBPFMIPreEmitCheckingPass() { + return new BPFMIPreEmitChecking(); +} diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 64b115b8fc8afa..7d91fa8bb824cf 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -178,6 +178,7 @@ void BPFPassConfig::addMachineSSAOptimization() { } void BPFPassConfig::addPreEmitPass() { + addPass(createBPFMIPreEmitCheckingPass()); if (getOptLevel() != CodeGenOptLevel::None) if (!DisableMIPeephole) addPass(createBPFMIPreEmitPeepholePass()); diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 253660d4d62e37..eade4cacb7100e 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(BPFCodeGen BPFSubtarget.cpp BPFTargetMachine.cpp BPFMIPeephole.cpp + BPFMIChecking.cpp BPFMISimplifyPatchable.cpp BTFDebug.cpp diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index e40981f5b5cd57..595ce9fc815bf0 100644 --- 
a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -65,7 +65,7 @@ static MCAsmInfo *createLoongArchMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI = new LoongArchMCAsmInfo(TT); // Initial state of the frame pointer is sp(r3). - MCRegister SP = MRI.getDwarfRegNum(LoongArch::R3, true); + unsigned SP = MRI.getDwarfRegNum(LoongArch::R3, true); MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp index 6ad2c003558a51..08e5ccc7bc0be5 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -82,9 +82,9 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { TmpVal1 = Insts[1].Imm; if (N == 3) break; - TmpVal2 = Insts[3].Imm << 52 | TmpVal1; + TmpVal2 = static_cast(Insts[3].Imm) << 52 | TmpVal1; } - TmpVal1 |= Insts[0].Imm << 12; + TmpVal1 |= static_cast(Insts[0].Imm) << 12; break; case LoongArch::ORI: case LoongArch::ADDI_W: @@ -92,8 +92,9 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { break; } - for (uint64_t Msb = 32; Msb < 64; ++Msb) { - uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + uint64_t Msb = 32; + uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + for (; Msb < 64; ++Msb, HighMask = (HighMask << 1) + 1) { for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) { uint64_t LowMask = (1ULL << Lsb) - 1; uint64_t Mask = HighMask | LowMask; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index e051312d61a7bc..53329af093de0f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -71,7 +71,7 @@ static MCAsmInfo 
*createRISCVMCAsmInfo(const MCRegisterInfo &MRI, const MCTargetOptions &Options) { MCAsmInfo *MAI = new RISCVMCAsmInfo(TT); - MCRegister SP = MRI.getDwarfRegNum(RISCV::X2, true); + unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true); MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index efdf6bebfce301..73649129e4f93f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -234,12 +234,12 @@ let RegAltNameIndices = [ABIRegAltName] in { foreach Index = 0-31 in { def F#Index#_F : RISCVReg32("F"#Index#"_H")>, - DwarfRegNum<[!add(Index, 32)]>; + DwarfRegAlias("F"#Index#"_H")>; } foreach Index = 0-31 in { def F#Index#_D : RISCVReg64("F"#Index#"_F")>, - DwarfRegNum<[!add(Index, 32)]>; + DwarfRegAlias("F"#Index#"_H")>; } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index da4b8d228f627d..9d17d90f530541 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -763,7 +763,7 @@ multiclass SIMDConditionInt baseInst> { multiclass SIMDConditionFP baseInst> { defm "" : SIMDCondition; defm "" : SIMDCondition; - defm "" : HalfPrecisionCondition; + defm "" : HalfPrecisionCondition; } // Equality: eq @@ -1218,7 +1218,7 @@ multiclass SIMDUnaryFP baseInst> { // Unlike F32x4 and F64x2 there's not a gap in the opcodes between "neg" and // "sqrt" so subtract one from the offset. 
defm "" : HalfPrecisionUnary; + !add(baseInst,!if(!eq(name, "sqrt"), 79, 80))>; } // Absolute value: abs @@ -1239,10 +1239,10 @@ defm CEIL : SIMDUnary; defm FLOOR : SIMDUnary; defm TRUNC: SIMDUnary; defm NEAREST: SIMDUnary; -defm CEIL : HalfPrecisionUnary; -defm FLOOR : HalfPrecisionUnary; -defm TRUNC : HalfPrecisionUnary; -defm NEAREST : HalfPrecisionUnary; +defm CEIL : HalfPrecisionUnary; +defm FLOOR : HalfPrecisionUnary; +defm TRUNC : HalfPrecisionUnary; +defm NEAREST : HalfPrecisionUnary; // WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. def : Pat<(v4f32 (frint (v4f32 V128:$src))), (NEAREST_F32x4 V128:$src)>; @@ -1261,7 +1261,7 @@ def : Pat<(v8f16 (froundeven (v8f16 V128:$src))), (NEAREST_F16x8 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; - defm "" : HalfPrecisionBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1362,8 +1362,8 @@ multiclass HalfPrecisionConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Support the saturating variety as well. def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>; @@ -1394,8 +1394,8 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Extending operations // TODO: refactor this to be uniform for i64x2 if the numbering is not changed. 
@@ -1538,7 +1538,7 @@ multiclass SIMDMADD simdopA, bits<32> simdopS, list defm "" : SIMDMADD; defm "" : SIMDMADD; -defm "" : SIMDMADD; +defm "" : SIMDMADD; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 66b68d5cd457fb..52def8f21312de 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -242,9 +242,16 @@ class CallsiteContextGraph { // recursion. bool Recursive = false; - // The corresponding allocation or interior call. + // The corresponding allocation or interior call. This is the primary call + // for which we have created this node. CallInfo Call; + // List of other calls that can be treated the same as the primary call + // through cloning. I.e. located in the same function and have the same + // (possibly pruned) stack ids. They will be updated the same way as the + // primary call when assigning to function clones. + std::vector MatchingCalls; + // For alloc nodes this is a unique id assigned when constructed, and for // callsite stack nodes it is the original stack id when the node is // constructed from the memprof MIB metadata on the alloc nodes. Note that @@ -457,6 +464,9 @@ class CallsiteContextGraph { /// iteration. MapVector> FuncToCallsWithMetadata; + /// Records the function each call is located in. + DenseMap CallToFunc; + /// Map from callsite node to the enclosing caller function. std::map NodeToCallingFunc; @@ -474,7 +484,8 @@ class CallsiteContextGraph { /// StackIdToMatchingCalls map. 
void assignStackNodesPostOrder( ContextNode *Node, DenseSet &Visited, - DenseMap> &StackIdToMatchingCalls); + DenseMap> &StackIdToMatchingCalls, + DenseMap &CallToMatchingCall); /// Duplicates the given set of context ids, updating the provided /// map from each original id with the newly generated context ids, @@ -521,6 +532,11 @@ class CallsiteContextGraph { Call, Func, CallerFunc, FoundCalleeChain); } + /// Returns true if both call instructions have the same callee. + bool sameCallee(CallTy Call1, CallTy Call2) { + return static_cast(this)->sameCallee(Call1, Call2); + } + /// Get a list of nodes corresponding to the stack ids in the given /// callsite's context. std::vector getStackIdsWithContextNodesForCall(CallTy Call) { @@ -667,6 +683,7 @@ class ModuleCallsiteContextGraph bool calleeMatchesFunc( Instruction *Call, const Function *Func, const Function *CallerFunc, std::vector> &FoundCalleeChain); + bool sameCallee(Instruction *Call1, Instruction *Call2); bool findProfiledCalleeThroughTailCalls( const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, std::vector> &FoundCalleeChain, @@ -744,6 +761,7 @@ class IndexCallsiteContextGraph IndexCall &Call, const FunctionSummary *Func, const FunctionSummary *CallerFunc, std::vector> &FoundCalleeChain); + bool sameCallee(IndexCall &Call1, IndexCall &Call2); bool findProfiledCalleeThroughTailCalls( ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, std::vector> &FoundCalleeChain, @@ -1230,10 +1248,11 @@ static void checkNode(const ContextNode *Node, template void CallsiteContextGraph:: - assignStackNodesPostOrder(ContextNode *Node, - DenseSet &Visited, - DenseMap> - &StackIdToMatchingCalls) { + assignStackNodesPostOrder( + ContextNode *Node, DenseSet &Visited, + DenseMap> + &StackIdToMatchingCalls, + DenseMap &CallToMatchingCall) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1246,7 +1265,8 @@ void CallsiteContextGraph:: // Skip any that have been removed during the 
recursion. if (!Edge) continue; - assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); + assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, + CallToMatchingCall); } // If this node's stack id is in the map, update the graph to contain new @@ -1289,8 +1309,19 @@ void CallsiteContextGraph:: auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; // Skip any for which we didn't assign any ids, these don't get a node in // the graph. - if (SavedContextIds.empty()) + if (SavedContextIds.empty()) { + // If this call has a matching call (located in the same function and + // having the same stack ids), simply add it to the context node created + // for its matching call earlier. These can be treated the same through + // cloning and get updated at the same time. + if (!CallToMatchingCall.contains(Call)) + continue; + auto MatchingCall = CallToMatchingCall[Call]; + assert(NonAllocationCallToContextNodeMap.contains(MatchingCall)); + NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back( + Call); continue; + } assert(LastId == Ids.back()); @@ -1422,6 +1453,10 @@ void CallsiteContextGraph::updateStackNodes() { // there is more than one call with the same stack ids. Their (possibly newly // duplicated) context ids are saved in the StackIdToMatchingCalls map. DenseMap> OldToNewContextIds; + // Save a map from each call to any that are found to match it. I.e. located + // in the same function and have the same (possibly pruned) stack ids. We use + // this to avoid creating extra graph nodes as they can be treated the same. + DenseMap CallToMatchingCall; for (auto &It : StackIdToMatchingCalls) { auto &Calls = It.getSecond(); // Skip single calls with a single stack id. These don't need a new node. 
@@ -1460,6 +1495,13 @@ void CallsiteContextGraph::updateStackNodes() { DenseSet LastNodeContextIds = LastNode->getContextIds(); assert(!LastNodeContextIds.empty()); + // Map from function to the first call from the below list (with matching + // stack ids) found in that function. Note that calls from different + // functions can have the same stack ids because this is the list of stack + // ids that had (possibly pruned) nodes after building the graph from the + // allocation MIBs. + DenseMap FuncToCallMap; + for (unsigned I = 0; I < Calls.size(); I++) { auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; assert(SavedContextIds.empty()); @@ -1533,6 +1575,18 @@ void CallsiteContextGraph::updateStackNodes() { continue; } + const FuncTy *CallFunc = CallToFunc[Call]; + + // If the prior call had the same stack ids this map would not be empty. + // Check if we already have a call that "matches" because it is located + // in the same function. + if (FuncToCallMap.contains(CallFunc)) { + // Record the matching call found for this call, and skip it. We + // will subsequently combine it into the same node. + CallToMatchingCall[Call] = FuncToCallMap[CallFunc]; + continue; + } + // Check if the next set of stack ids is the same (since the Calls vector // of tuples is sorted by the stack ids we can just look at the next one). bool DuplicateContextIds = false; @@ -1562,7 +1616,14 @@ void CallsiteContextGraph::updateStackNodes() { set_subtract(LastNodeContextIds, StackSequenceContextIds); if (LastNodeContextIds.empty()) break; - } + // No longer possibly in a sequence of calls with duplicate stack ids, + // clear the map. + FuncToCallMap.clear(); + } else + // Record the call with its function, so we can locate it the next time + // we find a call from this function when processing the calls with the + // same stack ids. + FuncToCallMap[CallFunc] = Call; } } @@ -1579,7 +1640,8 @@ void CallsiteContextGraph::updateStackNodes() { // associated context ids over to the new nodes. 
DenseSet Visited; for (auto &Entry : AllocationCallToContextNodeMap) - assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); + assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, + CallToMatchingCall); if (VerifyCCG) check(); } @@ -1679,6 +1741,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( continue; if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { CallsWithMetadata.push_back(&I); + CallToFunc[&I] = &F; auto *AllocNode = addAllocNode(&I, &F); auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); assert(CallsiteMD); @@ -1700,8 +1763,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( I.setMetadata(LLVMContext::MD_callsite, nullptr); } // For callsite metadata, add to list for this function for later use. - else if (I.getMetadata(LLVMContext::MD_callsite)) + else if (I.getMetadata(LLVMContext::MD_callsite)) { CallsWithMetadata.push_back(&I); + CallToFunc[&I] = &F; + } } } if (!CallsWithMetadata.empty()) @@ -1756,8 +1821,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( // correlate properly in applyImport in the backends. if (AN.MIBs.empty()) continue; - CallsWithMetadata.push_back({&AN}); - auto *AllocNode = addAllocNode({&AN}, FS); + IndexCall AllocCall(&AN); + CallsWithMetadata.push_back(AllocCall); + CallToFunc[AllocCall] = FS; + auto *AllocNode = addAllocNode(AllocCall, FS); // Pass an empty CallStack to the CallsiteContext (second) // parameter, since for ThinLTO we already collapsed out the inlined // stack ids on the allocation call during ModuleSummaryAnalysis. @@ -1788,8 +1855,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( } // For callsite metadata, add to list for this function for later use. 
    if (!FS->callsites().empty()) - for (auto &SN : FS->mutableCallsites()) - CallsWithMetadata.push_back({&SN}); + for (auto &SN : FS->mutableCallsites()) { + IndexCall StackNodeCall(&SN); + CallsWithMetadata.push_back(StackNodeCall); + CallToFunc[StackNodeCall] = FS; + } if (!CallsWithMetadata.empty()) FuncToCallsWithMetadata[FS] = CallsWithMetadata; @@ -1829,26 +1899,76 @@ void CallsiteContextGraph TailCallToContextNodeMap; + std::vector> NewCallToNode; for (auto &Entry : NonAllocationCallToContextNodeMap) { auto *Node = Entry.second; assert(Node->Clones.empty()); // Check all node callees and see if in the same function. - auto Call = Node->Call.call(); - for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); - ++EI) { - auto Edge = *EI; - if (!Edge->Callee->hasCall()) - continue; - assert(NodeToCallingFunc.count(Edge->Callee)); - // Check if the called function matches that of the callee node. - if (calleesMatch(Call, EI, TailCallToContextNodeMap)) - continue; + // We need to check all of the calls recorded in this Node, because in some + // cases we may have had multiple calls with the same debug info calling + // different callees. This can happen, for example, when an object is + // constructed in the parameter list - the destructor call of the object has + // the same debug info (line/col) as the call the object was passed to. + // Here we will prune any that don't match all callee nodes. + std::vector AllCalls; + AllCalls.reserve(Node->MatchingCalls.size() + 1); + AllCalls.push_back(Node->Call); + AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(), + Node->MatchingCalls.end()); + auto It = AllCalls.begin(); + // Iterate through the calls until we find the first that matches. 
+ for (; It != AllCalls.end(); ++It) { + auto ThisCall = *It; + bool Match = true; + for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); + ++EI) { + auto Edge = *EI; + if (!Edge->Callee->hasCall()) + continue; + assert(NodeToCallingFunc.count(Edge->Callee)); + // Check if the called function matches that of the callee node. + if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) { + Match = false; + break; + } + } + // Found a call that matches the callee nodes, we can quit now. + if (Match) { + // If the first match is not the primary call on the Node, update it + // now. We will update the list of matching calls further below. + if (Node->Call != ThisCall) { + Node->setCall(ThisCall); + // We need to update the NonAllocationCallToContextNodeMap, but don't + // want to do this during iteration over that map, so save the calls + // that need updated entries. + NewCallToNode.push_back({ThisCall, Node}); + // We should only have shared this node between calls from the same + // function. + assert(NodeToCallingFunc[Node] == CallToFunc[Node->Call]); + } + break; + } + } + // We will update this list below (or leave it cleared if there was no + // match found above). + Node->MatchingCalls.clear(); + // If we hit the end of the AllCalls vector, no call matching the callee + // nodes was found, clear the call information in the node. + if (It == AllCalls.end()) { RemovedEdgesWithMismatchedCallees++; // Work around by setting Node to have a null call, so it gets // skipped during cloning. Otherwise assignFunctions will assert // because its data structures are not designed to handle this case. Node->setCall(CallInfo()); - break; + continue; + } + // Now add back any matching calls that call the same function as the + // matching primary call on Node. 
+ for (++It; It != AllCalls.end(); ++It) { + auto ThisCall = *It; + if (!sameCallee(Node->Call.call(), ThisCall.call())) + continue; + Node->MatchingCalls.push_back(ThisCall); } } @@ -1856,8 +1976,14 @@ void CallsiteContextGraphhasCall(); }); + // Also remove any entries if we updated the node's primary call above. + NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { + return !it.second->hasCall() || it.second->Call != it.first; + }); + + // Add entries for any new primary calls recorded above. + for (auto &[Call, Node] : NewCallToNode) + NonAllocationCallToContextNodeMap[Call] = Node; // Add the new nodes after the above loop so that the iteration is not // invalidated. @@ -2083,6 +2209,21 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc( return true; } +bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, + Instruction *Call2) { + auto *CB1 = cast(Call1); + if (!CB1->getCalledOperand() || CB1->isIndirectCall()) + return false; + auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); + auto *CalleeFunc1 = dyn_cast(CalleeVal1); + auto *CB2 = cast(Call2); + if (!CB2->getCalledOperand() || CB2->isIndirectCall()) + return false; + auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); + auto *CalleeFunc2 = dyn_cast(CalleeVal2); + return CalleeFunc1 == CalleeFunc2; +} + bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, std::vector> &FoundCalleeChain, @@ -2209,6 +2350,14 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc( return true; } +bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { + ValueInfo Callee1 = + dyn_cast_if_present(Call1.getBase())->Callee; + ValueInfo Callee2 = + dyn_cast_if_present(Call2.getBase())->Callee; + return Callee1 == Callee2; +} + template void CallsiteContextGraph::ContextNode::dump() const { @@ -2225,6 +2374,14 @@ void CallsiteContextGraph::ContextNode::print( if (Recursive) OS << 
" (recursive)"; OS << "\n"; + if (!MatchingCalls.empty()) { + OS << "\tMatchingCalls:\n"; + for (auto &MatchingCall : MatchingCalls) { + OS << "\t"; + MatchingCall.print(OS); + OS << "\n"; + } + } OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; OS << "\tContextIds:"; // Make a copy of the computed context ids that we can sort for stability. @@ -2478,6 +2635,7 @@ CallsiteContextGraph::moveEdgeToNewCalleeClone( std::make_unique(Node->IsAllocation, Node->Call)); ContextNode *Clone = NodeOwner.back().get(); Node->addClone(Clone); + Clone->MatchingCalls = Node->MatchingCalls; assert(NodeToCallingFunc.count(Node)); NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true, @@ -3021,6 +3179,14 @@ bool CallsiteContextGraph::assignFunctions() { if (CallMap.count(Call)) CallClone = CallMap[Call]; CallsiteClone->setCall(CallClone); + // Need to do the same for all matching calls. + for (auto &MatchingCall : Node->MatchingCalls) { + CallInfo CallClone(MatchingCall); + if (CallMap.count(MatchingCall)) + CallClone = CallMap[MatchingCall]; + // Updates the call in the list. + MatchingCall = CallClone; + } }; // Keep track of the clones of callsite Node that need to be assigned to @@ -3187,6 +3353,16 @@ bool CallsiteContextGraph::assignFunctions() { CallInfo NewCall(CallMap[OrigCall]); assert(NewCall); NewClone->setCall(NewCall); + // Need to do the same for all matching calls. + for (auto &MatchingCall : NewClone->MatchingCalls) { + CallInfo OrigMatchingCall(MatchingCall); + OrigMatchingCall.setCloneNo(0); + assert(CallMap.count(OrigMatchingCall)); + CallInfo NewCall(CallMap[OrigMatchingCall]); + assert(NewCall); + // Updates the call in the list. 
+ MatchingCall = NewCall; + } } } // Fall through to handling below to perform the recording of the @@ -3373,6 +3549,7 @@ bool CallsiteContextGraph::assignFunctions() { if (Node->IsAllocation) { updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + assert(Node->MatchingCalls.empty()); return; } @@ -3381,6 +3558,9 @@ bool CallsiteContextGraph::assignFunctions() { auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; updateCall(Node->Call, CalleeFunc); + // Update all the matching calls as well. + for (auto &Call : Node->MatchingCalls) + updateCall(Call, CalleeFunc); }; // Performs DFS traversal starting from allocation nodes to update calls to diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index a49d3b0b990bc7..8a8d8afece6cb4 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1210,6 +1210,11 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); + auto getIdentity = [&]() { + Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); + unsigned Opc = getArithmeticReductionInstruction(ID); + return ConstantExpr::getBinOpIdentity(Opc, SrcVecEltTy); + }; switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -1227,10 +1232,9 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); case RecurKind::FMulAdd: case RecurKind::FAdd: - return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), - Src); + return Builder.CreateFAddReduce(getIdentity(), Src); case RecurKind::FMul: - return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src); + return Builder.CreateFMulReduce(getIdentity(), Src); default: llvm_unreachable("Unhandled opcode"); } diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f84317ba51257a..c9cee652d2d326 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1802,18 +1802,18 @@ void VPReductionRecipe::execute(VPTransformState &State) { (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; + NextInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), + NewRed, PrevInChain); + else + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, + PrevInChain); } - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), - NewRed, PrevInChain); - } else if (IsOrdered) - NextInChain = NewRed; - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part, /*IsScalar*/ true); } } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc7..0d230bb9dcc6e9 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 
-; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 
-; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp 
w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: 
// Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; 
SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 
+; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7df..bfe0d20ca814bc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; 
SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; 
SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore 
+; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, 
w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child 
Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp 
x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, 
[x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 
killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e428608..6b7d2df044460a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, 
#0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, 
w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child 
Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp 
x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, 
[x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 
killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e731859..67e164037d5ce7 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, 
#0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp 
w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: 
// Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; 
SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 
+; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/BPF/atomics.ll b/llvm/test/CodeGen/BPF/atomics.ll index 0c16c49f2a873b..c17b94af5f7bd9 100644 --- a/llvm/test/CodeGen/BPF/atomics.ll +++ b/llvm/test/CodeGen/BPF/atomics.ll @@ -1,10 +1,11 @@ -; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck --check-prefixes=CHECK,CHECK-V2 %s -; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefixes=CHECK,CHECK-V3 %s +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck %s +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefix=CHECK-V3 %s ; CHECK-LABEL: test_load_add_32 -; CHECK-V2: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2) +; CHECK: lock *(u32 *)(r1 + 0) += r2 +; CHECK: encoding: [0xc3,0x21 ; CHECK-V3: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) -; CHECK: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00] +; CHECK-V3: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00] define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst @@ -12,8 +13,10 @@ entry: } ; CHECK-LABEL: test_load_add_64 -; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) -; CHECK: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00] +; CHECK: lock *(u64 *)(r1 + 0) += r2 +; CHECK: encoding: [0xdb,0x21 +; CHECK-V3: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) +; CHECK-V3: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00] define void @test_load_add_64(ptr %p, i64 zeroext %v) { entry: atomicrmw add ptr %p, i64 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/atomics_2.ll b/llvm/test/CodeGen/BPF/atomics_2.ll index 
c670ddb05b6a77..6371e3b875638e 100644 --- a/llvm/test/CodeGen/BPF/atomics_2.ll +++ b/llvm/test/CodeGen/BPF/atomics_2.ll @@ -224,7 +224,7 @@ entry: } ; CHECK-LABEL: test_atomic_xor_64 -; CHECK: r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2) +; CHECK: atomic_fetch_xor((u64 *)(r1 + 0), r2) ; CHECK: encoding: [0xdb,0x21,0x00,0x00,0xa1,0x00,0x00,0x00] ; CHECK: w0 = 0 define dso_local i32 @test_atomic_xor_64(ptr nocapture %p, i64 %v) local_unnamed_addr { diff --git a/llvm/test/CodeGen/BPF/objdump_atomics.ll b/llvm/test/CodeGen/BPF/objdump_atomics.ll index c4cb16b2c36418..fcc889ba300e39 100644 --- a/llvm/test/CodeGen/BPF/objdump_atomics.ll +++ b/llvm/test/CodeGen/BPF/objdump_atomics.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: test_load_add_32 ; CHECK: c3 21 -; CHECK: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) +; CHECK: lock *(u32 *)(r1 + 0) += w2 define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst @@ -11,7 +11,7 @@ entry: ; CHECK-LABEL: test_load_add_64 ; CHECK: db 21 -; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) +; CHECK: lock *(u64 *)(r1 + 0) += r2 define void @test_load_add_64(ptr %p, i64 zeroext %v) { entry: atomicrmw add ptr %p, i64 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll new file mode 100644 index 00000000000000..5aeeb9baf7b892 --- /dev/null +++ b/llvm/test/CodeGen/BPF/xadd.ll @@ -0,0 +1,59 @@ +; RUN: not llc -march=bpfel < %s 2>&1 | FileCheck %s +; RUN: not llc -march=bpfeb < %s 2>&1 | FileCheck %s + +; This file is generated with the source command and source +; $ clang -target bpf -O2 -g -S -emit-llvm t.c +; $ cat t.c +; int test(int *ptr) { +; int r; +; __sync_fetch_and_add(ptr, 4); +; r = __sync_fetch_and_add(ptr, 6); +; return r; +; } + +; ModuleID = 't.c' +source_filename = "t.c" +target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128" +target triple = "bpf" + +; Function Attrs: nounwind +define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 { +entry: 
+ call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15 + %0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16 + %1 = atomicrmw add ptr %ptr, i32 6 seq_cst, !dbg !17 +; CHECK: in function test i32 (ptr): Invalid usage of the XADD return value + call void @llvm.dbg.value(metadata i32 %1, metadata !14, metadata !DIExpression()), !dbg !18 + ret i32 %1, !dbg !19 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "t.c", directory: "/home/yhs/work/tests/llvm/sync/test1") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !11} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: 
"ptr", arg: 1, scope: !7, file: !1, line: 1, type: !11) +!14 = !DILocalVariable(name: "r", scope: !7, file: !1, line: 2, type: !10) +!15 = !DILocation(line: 1, column: 15, scope: !7) +!16 = !DILocation(line: 3, column: 4, scope: !7) +!17 = !DILocation(line: 4, column: 8, scope: !7) +!18 = !DILocation(line: 2, column: 8, scope: !7) +!19 = !DILocation(line: 5, column: 4, scope: !7) diff --git a/llvm/test/CodeGen/BPF/xadd_legal.ll b/llvm/test/CodeGen/BPF/xadd_legal.ll index 88f04d85a779f8..9b07afade3fee9 100644 --- a/llvm/test/CodeGen/BPF/xadd_legal.ll +++ b/llvm/test/CodeGen/BPF/xadd_legal.ll @@ -19,7 +19,7 @@ define dso_local i32 @test(ptr nocapture %ptr, i64 %a) { entry: %conv = trunc i64 %a to i32 %0 = atomicrmw add ptr %ptr, i32 %conv seq_cst -; CHECK-64: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2) +; CHECK-64: lock *(u32 *)(r1 + 0) += r2 ; CHECK-32: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) %1 = load i32, ptr %ptr, align 4 ret i32 %1 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 59981a282ab43e..f00829530bb97e 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -4,21 +4,21 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zfh -verify-machineinstrs \ ; RUN: -target-abi lp64f < %s | FileCheck -check-prefix=CHECKIZFH %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinx -verify-machineinstrs \ -; RUN: -target-abi ilp32 < %s | FileCheck -check-prefix=CHECK-ZHINX %s +; RUN: -target-abi ilp32 < %s | FileCheck -check-prefix=CHECKIZHINX %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinx -verify-machineinstrs \ -; RUN: -target-abi lp64 < %s | FileCheck -check-prefix=CHECK-ZHINX %s +; RUN: -target-abi lp64 < %s | FileCheck -check-prefix=CHECKIZHINX %s ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv32 -mattr=+zfhmin 
-verify-machineinstrs \ -; RUN: -target-abi ilp32f < %s | FileCheck -check-prefixes=CHECKIZFHMIN,CHECK-RV32-FSGNJ %s +; RUN: -target-abi ilp32f < %s | FileCheck -check-prefixes=CHECKIZFHMIN,RV32IZFHMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+zfhmin -verify-machineinstrs \ -; RUN: -target-abi lp64f < %s | FileCheck --check-prefixes=CHECKIZFHMIN,CHECK-RV64-FSGNJ %s +; RUN: -target-abi lp64f < %s | FileCheck --check-prefixes=CHECKIZFHMIN,RV64IZFHMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinxmin -verify-machineinstrs \ -; RUN: -target-abi ilp32 < %s | FileCheck --check-prefixes=CHECKZHINXMIN %s +; RUN: -target-abi ilp32 < %s | FileCheck --check-prefixes=CHECKIZHINXMIN,RV32IZHINXMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinxmin -verify-machineinstrs \ -; RUN: -target-abi lp64 < %s | FileCheck --check-prefixes=CHECKZHINXMIN %s +; RUN: -target-abi lp64 < %s | FileCheck --check-prefixes=CHECKIZHINXMIN,RV64IZHINXMIN %s ; These tests are each targeted at a particular RISC-V FPU instruction. ; Compares and conversions can be found in half-fcmp.ll and half-convert.ll @@ -31,10 +31,10 @@ define half @fadd_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fadd.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fadd_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fadd_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fadd_s: ; RV32I: # %bb.0: @@ -96,20 +96,13 @@ define half @fadd_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fadd_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fadd_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: 
fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fadd_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fadd half %a, %b ret half %1 } @@ -120,10 +113,10 @@ define half @fsub_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fsub.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsub_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsub.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsub_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsub.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsub_s: ; RV32I: # %bb.0: @@ -185,20 +178,13 @@ define half @fsub_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fsub_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsub_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fsub_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fsub.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fsub half %a, %b ret half %1 } @@ -209,10 +195,10 @@ define half @fmul_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fmul.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmul_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmul.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret 
+; CHECKIZHINX-LABEL: fmul_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmul.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmul_s: ; RV32I: # %bb.0: @@ -274,20 +260,13 @@ define half @fmul_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmul_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmul_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmul_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fmul half %a, %b ret half %1 } @@ -298,10 +277,10 @@ define half @fdiv_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fdiv.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fdiv_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fdiv.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fdiv_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fdiv.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fdiv_s: ; RV32I: # %bb.0: @@ -363,20 +342,13 @@ define half @fdiv_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fdiv_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fdiv.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fdiv_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; 
CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fdiv.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fdiv_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fdiv.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fdiv half %a, %b ret half %1 } @@ -389,10 +361,10 @@ define half @fsqrt_s(half %a) nounwind { ; CHECKIZFH-NEXT: fsqrt.h fa0, fa0 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsqrt_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsqrt.h a0, a0 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsqrt_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsqrt.h a0, a0 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsqrt_s: ; RV32I: # %bb.0: @@ -427,18 +399,12 @@ define half @fsqrt_s(half %a) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fsqrt_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fsqrt.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsqrt_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fsqrt.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fsqrt_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fsqrt.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.sqrt.f16(half %a) ret half %1 } @@ -451,10 +417,10 @@ define half @fsgnj_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fsgnj.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsgnj_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsgnj.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsgnj_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsgnj.h a0, a0, a1 +; 
CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsgnj_s: ; RV32I: # %bb.0: @@ -474,79 +440,65 @@ define half @fsgnj_s(half %a, half %b) nounwind { ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fsgnj_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fsh fa1, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a1, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV32-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fsgnj_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fsh fa1, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV64-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsgnj_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: addi sp, sp, -16 -; CHECK-ZHINXMIN-NEXT: addi a2, sp, 12 -; CHECK-ZHINXMIN-NEXT: sh a1, 0(a2) -; CHECK-ZHINXMIN-NEXT: addi a1, sp, 8 -; CHECK-ZHINXMIN-NEXT: sh a0, 0(a1) -; CHECK-ZHINXMIN-NEXT: lbu a0, 13(sp) -; CHECK-ZHINXMIN-NEXT: lbu a2, 9(sp) -; CHECK-ZHINXMIN-NEXT: andi a0, a0, 128 -; CHECK-ZHINXMIN-NEXT: andi a2, a2, 127 -; CHECK-ZHINXMIN-NEXT: or a0, a2, a0 -; CHECK-ZHINXMIN-NEXT: sb a0, 9(sp) -; CHECK-ZHINXMIN-NEXT: lh a0, 0(a1) -; CHECK-ZHINXMIN-NEXT: addi sp, sp, 16 -; CHECK-ZHINXMIN-NEXT: ret -; CHECKFSGNJ-LABEL: fsgnj_s: -; CHECKFSGNJ: # %bb.0: -; CHECKFSGNJ-NEXT: addi sp, sp, -16 -; CHECKFSGNJ-NEXT: fsh 
fa1, 12(sp) -; CHECKFSGNJ-NEXT: fsh fa0, 8(sp) -; CHECKFSGNJ-NEXT: lbu a0, 13(sp) -; CHECKFSGNJ-NEXT: lbu a1, 9(sp) -; CHECKFSGNJ-NEXT: andi a0, a0, 128 -; CHECKFSGNJ-NEXT: andi a1, a1, 127 -; CHECKFSGNJ-NEXT: or a0, a1, a0 -; CHECKFSGNJ-NEXT: sb a0, 9(sp) -; CHECKFSGNJ-NEXT: flh fa0, 8(sp) -; CHECKFSGNJ-NEXT: addi sp, sp, 16 -; CHECKFSGNJ-NEXT: ret -; CHECK64FSGNJ-LABEL: fsgnj_s: -; CHECK64FSGNJ: # %bb.0: -; CHECK64FSGNJ-NEXT: addi sp, sp, -16 -; CHECK64FSGNJ-NEXT: fsh fa1, 8(sp) -; CHECK64FSGNJ-NEXT: fsh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK64FSGNJ-NEXT: lbu a1, 1(sp) -; CHECK64FSGNJ-NEXT: andi a0, a0, 128 -; CHECK64FSGNJ-NEXT: andi a1, a1, 127 -; CHECK64FSGNJ-NEXT: or a0, a1, a0 -; CHECK64FSGNJ-NEXT: sb a0, 1(sp) -; CHECK64FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: addi sp, sp, 16 -; CHECK64FSGNJ-NEXT: ret +; RV32IZFHMIN-LABEL: fsgnj_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fsh fa1, 12(sp) +; RV32IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: lbu a1, 9(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 128 +; RV32IZFHMIN-NEXT: andi a1, a1, 127 +; RV32IZFHMIN-NEXT: or a0, a1, a0 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa0, 8(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fsgnj_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fsh fa1, 8(sp) +; RV64IZFHMIN-NEXT: fsh fa0, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: lbu a1, 1(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 128 +; RV64IZFHMIN-NEXT: andi a1, a1, 127 +; RV64IZFHMIN-NEXT: or a0, a1, a0 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa0, 0(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fsgnj_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: sh a1, 12(sp) +; RV32IZHINXMIN-NEXT: sh a0, 8(sp) +; 
RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV32IZHINXMIN-NEXT: andi a0, a0, 128 +; RV32IZHINXMIN-NEXT: andi a1, a1, 127 +; RV32IZHINXMIN-NEXT: or a0, a1, a0 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fsgnj_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: sh a1, 8(sp) +; RV64IZHINXMIN-NEXT: sh a0, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 1(sp) +; RV64IZHINXMIN-NEXT: andi a0, a0, 128 +; RV64IZHINXMIN-NEXT: andi a1, a1, 127 +; RV64IZHINXMIN-NEXT: or a0, a1, a0 +; RV64IZHINXMIN-NEXT: sb a0, 1(sp) +; RV64IZHINXMIN-NEXT: lh a0, 0(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %1 = call half @llvm.copysign.f16(half %a, half %b) ret half %1 } @@ -561,12 +513,12 @@ define i32 @fneg_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: feq.h a0, fa5, fa4 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fneg_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, a0 -; CHECK-ZHINX-NEXT: fneg.h a1, a0 -; CHECK-ZHINX-NEXT: feq.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fneg_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, a0 +; CHECKIZHINX-NEXT: fneg.h a1, a0 +; CHECKIZHINX-NEXT: feq.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fneg_s: ; RV32I: # %bb.0: @@ -630,50 +582,73 @@ define i32 @fneg_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fneg_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; 
CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: feq.s a0, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fneg_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: feq.s a0, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fneg_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: feq.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fneg_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa4, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa4 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fneg_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; 
RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa4, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa4 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fneg_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 13(sp) +; RV32IZHINXMIN-NEXT: lh a1, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: feq.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fneg_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: feq.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %1 = fadd half %a, %a %2 = fneg half %1 %3 = fcmp oeq half %1, %2 @@ -690,11 +665,11 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fsgnjn.h fa0, fa0, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsgnjn_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a1, a0, a1 -; CHECK-ZHINX-NEXT: fsgnjn.h a0, a0, 
a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsgnjn_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a1, a0, a1 +; CHECKIZHINX-NEXT: fsgnjn.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsgnjn_s: ; RV32I: # %bb.0: @@ -774,118 +749,101 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fsgnjn_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 4(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 5(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 5(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 4(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a1, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 128 -; CHECK-RV32-FSGNJ-NEXT: or a0, a0, a1 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fsgnjn_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -32 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 16(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 24(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 17(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 25(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 127 
-; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 128 -; CHECK-RV64-FSGNJ-NEXT: or a0, a0, a1 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 17(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 16(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 32 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsgnjn_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: addi sp, sp, -16 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a2, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: addi a2, sp, 8 -; CHECK-ZHINXMIN-NEXT: sh a0, 0(a2) -; CHECK-ZHINXMIN-NEXT: addi a0, sp, 12 -; CHECK-ZHINXMIN-NEXT: sh a1, 0(a0) -; CHECK-ZHINXMIN-NEXT: lbu a0, 9(sp) -; CHECK-ZHINXMIN-NEXT: lbu a1, 13(sp) -; CHECK-ZHINXMIN-NEXT: andi a0, a0, 127 -; CHECK-ZHINXMIN-NEXT: andi a1, a1, 128 -; CHECK-ZHINXMIN-NEXT: or a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: sb a0, 9(sp) -; CHECK-ZHINXMIN-NEXT: lh a0, 0(a2) -; CHECK-ZHINXMIN-NEXT: addi sp, sp, 16 -; CHECK-ZHINXMIN-NEXT: ret -; CHECKFSGNJ-LABEL: fsgnjn_s: -; CHECKFSGNJ: # %bb.0: -; CHECKFSGNJ-NEXT: addi sp, sp, -16 -; CHECKFSGNJ-NEXT: fcvt.s.h ft0, fa1 -; CHECKFSGNJ-NEXT: fcvt.s.h ft1, fa0 -; CHECKFSGNJ-NEXT: fadd.s ft0, ft1, ft0 -; CHECKFSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECKFSGNJ-NEXT: fcvt.s.h ft0, ft0 -; CHECKFSGNJ-NEXT: fneg.s ft0, ft0 -; CHECKFSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECKFSGNJ-NEXT: fsh fa0, 8(sp) -; CHECKFSGNJ-NEXT: fsh ft0, 12(sp) -; CHECKFSGNJ-NEXT: lbu a0, 9(sp) -; CHECKFSGNJ-NEXT: lbu a1, 13(sp) -; CHECKFSGNJ-NEXT: andi a0, a0, 127 -; CHECKFSGNJ-NEXT: andi a1, a1, 128 -; CHECKFSGNJ-NEXT: or a0, a0, a1 -; CHECKFSGNJ-NEXT: sb a0, 9(sp) -; CHECKFSGNJ-NEXT: flh fa0, 8(sp) -; CHECKFSGNJ-NEXT: addi sp, sp, 16 -; CHECKFSGNJ-NEXT: ret -; CHECK64FSGNJ-LABEL: fsgnjn_s: -; CHECK64FSGNJ: # %bb.0: -; CHECK64FSGNJ-NEXT: addi sp, sp, -16 -; CHECK64FSGNJ-NEXT: fcvt.s.h ft0, fa1 -; 
CHECK64FSGNJ-NEXT: fcvt.s.h ft1, fa0 -; CHECK64FSGNJ-NEXT: fadd.s ft0, ft1, ft0 -; CHECK64FSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECK64FSGNJ-NEXT: fcvt.s.h ft0, ft0 -; CHECK64FSGNJ-NEXT: fneg.s ft0, ft0 -; CHECK64FSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECK64FSGNJ-NEXT: fsh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: fsh ft0, 8(sp) -; CHECK64FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK64FSGNJ-NEXT: lbu a1, 9(sp) -; CHECK64FSGNJ-NEXT: andi a0, a0, 127 -; CHECK64FSGNJ-NEXT: andi a1, a1, 128 -; CHECK64FSGNJ-NEXT: or a0, a0, a1 -; CHECK64FSGNJ-NEXT: sb a0, 1(sp) -; CHECK64FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: addi sp, sp, 16 -; CHECK64FSGNJ-NEXT: ret +; RV32IZFHMIN-LABEL: fsgnjn_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 4(sp) +; RV32IZFHMIN-NEXT: lbu a0, 5(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 5(sp) +; RV32IZFHMIN-NEXT: flh fa5, 4(sp) +; RV32IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 9(sp) +; RV32IZFHMIN-NEXT: lbu a1, 13(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 127 +; RV32IZFHMIN-NEXT: andi a1, a1, 128 +; RV32IZFHMIN-NEXT: or a0, a0, a1 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa0, 8(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fsgnjn_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -32 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fsh fa0, 16(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 24(sp) +; RV64IZFHMIN-NEXT: lbu a0, 17(sp) 
+; RV64IZFHMIN-NEXT: lbu a1, 25(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 127 +; RV64IZFHMIN-NEXT: andi a1, a1, 128 +; RV64IZFHMIN-NEXT: or a0, a0, a1 +; RV64IZFHMIN-NEXT: sb a0, 17(sp) +; RV64IZFHMIN-NEXT: flh fa0, 16(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 32 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fsgnjn_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a0 +; RV32IZHINXMIN-NEXT: fadd.s a1, a2, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 4(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 5(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 5(sp) +; RV32IZHINXMIN-NEXT: lh a1, 4(sp) +; RV32IZHINXMIN-NEXT: sh a0, 8(sp) +; RV32IZHINXMIN-NEXT: sh a1, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: andi a0, a0, 127 +; RV32IZHINXMIN-NEXT: andi a1, a1, 128 +; RV32IZHINXMIN-NEXT: or a0, a0, a1 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fsgnjn_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -32 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a0 +; RV64IZHINXMIN-NEXT: fadd.s a1, a2, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: sh a1, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: sh a0, 16(sp) +; RV64IZHINXMIN-NEXT: sh a1, 24(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 17(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 25(sp) +; RV64IZHINXMIN-NEXT: andi a0, a0, 127 +; RV64IZHINXMIN-NEXT: andi a1, a1, 128 +; RV64IZHINXMIN-NEXT: or a0, a0, a1 +; RV64IZHINXMIN-NEXT: sb a0, 17(sp) +; RV64IZHINXMIN-NEXT: lh a0, 16(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, 32 +; RV64IZHINXMIN-NEXT: ret %1 = fadd half 
%a, %b %2 = fneg half %1 %3 = call half @llvm.copysign.f16(half %a, half %2) @@ -904,12 +862,12 @@ define half @fabs_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fadd.h fa0, fa4, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fabs_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, a1 -; CHECK-ZHINX-NEXT: fabs.h a1, a0 -; CHECK-ZHINX-NEXT: fadd.h a0, a1, a0 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fabs_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, a1 +; CHECKIZHINX-NEXT: fabs.h a1, a0 +; CHECKIZHINX-NEXT: fadd.h a0, a1, a0 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fabs_s: ; RV32I: # %bb.0: @@ -985,56 +943,81 @@ define half @fabs_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fabs_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fabs_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 
8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fabs_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fabs.s a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fabs_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 127 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa4, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fabs_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 127 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa4, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; 
RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fabs_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: andi a1, a1, 127 +; RV32IZHINXMIN-NEXT: sb a1, 13(sp) +; RV32IZHINXMIN-NEXT: lh a1, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a0, a1, a0 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fabs_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: andi a1, a1, 127 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a0, a1, a0 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %1 = fadd half %a, %b %2 = call half @llvm.fabs.f16(half %1) %3 = fadd half %2, %1 @@ -1049,10 +1032,10 @@ define half @fmin_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fmin.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmin_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmin.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmin_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmin.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmin_s: ; RV32I: # %bb.0: @@ -1114,20 +1097,13 @@ define half @fmin_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: 
fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmin_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmin.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmin_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmin.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmin_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmin.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.minnum.f16(half %a, half %b) ret half %1 } @@ -1140,10 +1116,10 @@ define half @fmax_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fmax.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmax_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmax.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmax_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmax.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmax_s: ; RV32I: # %bb.0: @@ -1205,20 +1181,13 @@ define half @fmax_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmax_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmax.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmax_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmax.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmax_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h 
a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmax.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.maxnum.f16(half %a, half %b) ret half %1 } @@ -1231,10 +1200,10 @@ define half @fmadd_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmadd.h fa0, fa0, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmadd_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmadd_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmadd_s: ; RV32I: # %bb.0: @@ -1311,22 +1280,14 @@ define half @fmadd_s(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmadd_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmadd_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmadd_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.fma.f16(half %a, half %b, half %c) ret half %1 } @@ -1339,11 +1300,11 @@ define half @fmsub_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmsub.h fa0, fa0, fa1, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmsub_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: 
fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmsub_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmsub_s: ; RV32I: # %bb.0: @@ -1433,59 +1394,83 @@ define half @fmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fmsub_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fmsub_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; 
CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmsub_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fmsub_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fmsub_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: 
fmsub_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV32IZHINXMIN-NEXT: sh a2, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 13(sp) +; RV32IZHINXMIN-NEXT: xori a2, a2, 128 +; RV32IZHINXMIN-NEXT: sb a2, 13(sp) +; RV32IZHINXMIN-NEXT: lh a2, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fmsub_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV64IZHINXMIN-NEXT: sh a2, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV64IZHINXMIN-NEXT: xori a2, a2, 128 +; RV64IZHINXMIN-NEXT: sb a2, 9(sp) +; RV64IZHINXMIN-NEXT: lh a2, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %c_ = fadd half 0.0, %c ; avoid negation using xor %negc = fsub half -0.0, %c_ %1 = call half @llvm.fma.f16(half %a, half %b, half %negc) @@ -1501,12 +1486,12 @@ define half @fnmadd_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa4, fa1, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fnmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: 
fnmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s: ; RV32I: # %bb.0: @@ -1624,81 +1609,115 @@ define half @fnmadd_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) -; 
CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 9(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa4, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; 
RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 1(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa4, 0(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmadd_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV32IZHINXMIN-NEXT: xori a0, a0, 128 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: sh a2, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 13(sp) +; RV32IZHINXMIN-NEXT: xori a2, a2, 128 +; RV32IZHINXMIN-NEXT: sb a2, 13(sp) +; 
RV32IZHINXMIN-NEXT: lh a2, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmadd_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 1(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV64IZHINXMIN-NEXT: xori a0, a0, 128 +; RV64IZHINXMIN-NEXT: sb a0, 1(sp) +; RV64IZHINXMIN-NEXT: lh a0, 0(sp) +; RV64IZHINXMIN-NEXT: sh a2, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV64IZHINXMIN-NEXT: xori a2, a2, 128 +; RV64IZHINXMIN-NEXT: sb a2, 9(sp) +; RV64IZHINXMIN-NEXT: lh a2, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a %c_ = fadd half 0.0, %c %nega = fsub half -0.0, %a_ @@ -1716,12 +1735,12 @@ define half @fnmadd_s_2(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa4, fa0, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s_2: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fnmadd.h a0, a1, a0, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s_2: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fnmadd.h a0, a1, a0, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s_2: ; RV32I: # %bb.0: @@ -1839,81 
+1858,115 @@ define half @fnmadd_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_2: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_2: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; 
CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s_2: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 9(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa4, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; 
RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s_2: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 1(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa4, 0(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmadd_s_2: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 9(sp) +; RV32IZHINXMIN-NEXT: lh a1, 8(sp) +; RV32IZHINXMIN-NEXT: sh a2, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 13(sp) +; RV32IZHINXMIN-NEXT: xori a2, a2, 128 +; RV32IZHINXMIN-NEXT: sb a2, 13(sp) +; RV32IZHINXMIN-NEXT: lh a2, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; 
RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmadd_s_2: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: sh a1, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 1(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 1(sp) +; RV64IZHINXMIN-NEXT: lh a1, 0(sp) +; RV64IZHINXMIN-NEXT: sh a2, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV64IZHINXMIN-NEXT: xori a2, a2, 128 +; RV64IZHINXMIN-NEXT: sb a2, 9(sp) +; RV64IZHINXMIN-NEXT: lh a2, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %b_ = fadd half 0.0, %b %c_ = fadd half 0.0, %c %negb = fsub half -0.0, %b_ @@ -1941,12 +1994,12 @@ define half @fnmadd_s_3(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fneg.h fa0, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s_3: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: lui a1, 1048568 -; CHECK-ZHINX-NEXT: xor a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s_3: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: lui a1, 1048568 +; CHECKIZHINX-NEXT: xor a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s_3: ; RV32I: # %bb.0: @@ -2018,58 +2071,48 @@ define half @fnmadd_s_3(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; 
RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_3: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_3: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; -; CHECKZHINXMIN-LABEL: fnmadd_s_3: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: lui a1, 1048568 -; CHECKZHINXMIN-NEXT: xor a0, a0, a1 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s_3: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: lui a1, 1048568 -; CHECK-ZHINXMIN-NEXT: xor a0, a0, a1 -; 
CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s_3: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa0, 12(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s_3: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa0, 8(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; CHECKIZHINXMIN-LABEL: fnmadd_s_3: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: lui a1, 1048568 +; CHECKIZHINXMIN-NEXT: xor a0, a0, a1 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.fma.f16(half %a, half %b, half %c) %neg = fneg half %1 ret half %neg @@ -2092,12 +2135,12 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa0, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_nsz: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: lui a1, 1048568 -; CHECK-ZHINX-NEXT: xor a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_nsz: +; CHECKIZHINX: # 
%bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: lui a1, 1048568 +; CHECKIZHINX-NEXT: xor a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_nsz: ; RV32I: # %bb.0: @@ -2169,58 +2212,48 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_nsz: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_nsz: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; -; CHECKZHINXMIN-LABEL: fnmadd_nsz: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: lui a1, 1048568 -; CHECKZHINXMIN-NEXT: xor a0, a0, a1 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_nsz: -; 
CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: lui a1, 1048568 -; CHECK-ZHINXMIN-NEXT: xor a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_nsz: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa0, 12(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_nsz: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa0, 8(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; CHECKIZHINXMIN-LABEL: fnmadd_nsz: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: lui a1, 1048568 +; CHECKIZHINXMIN-NEXT: xor a0, a0, a1 +; CHECKIZHINXMIN-NEXT: ret %1 = call nsz half @llvm.fma.f16(half %a, half %b, half %c) %neg = fneg nsz half %1 ret half %neg @@ -2234,11 +2267,11 @@ define half @fnmsub_s(half %a, half %b, half %c) nounwind { ; 
CHECKIZFH-NEXT: fnmsub.h fa0, fa5, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmsub_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fnmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmsub_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fnmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmsub_s: ; RV32I: # %bb.0: @@ -2326,59 +2359,83 @@ define half @fnmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmsub_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmsub_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h 
fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmsub_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmsub_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmsub_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: 
fmadd.s fa5, fa5, fa3, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmsub_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: xori a0, a0, 128 +; RV32IZHINXMIN-NEXT: sb a0, 13(sp) +; RV32IZHINXMIN-NEXT: lh a0, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmsub_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: xori a0, a0, 128 +; RV64IZHINXMIN-NEXT: sb a0, 9(sp) +; RV64IZHINXMIN-NEXT: lh a0, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a %nega = fsub half -0.0, %a_ %1 = call half @llvm.fma.f16(half %nega, half %b, half %c) @@ -2393,11 +2450,11 @@ define half @fnmsub_s_2(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmsub.h fa0, fa5, fa0, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmsub_s_2: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fnmsub.h a0, a1, a0, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmsub_s_2: +; CHECKIZHINX: # %bb.0: +; 
CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fnmsub.h a0, a1, a0, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmsub_s_2: ; RV32I: # %bb.0: @@ -2487,59 +2544,83 @@ define half @fnmsub_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmsub_s_2: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmsub_s_2: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmsub_s_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; 
CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmsub_s_2: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmsub_s_2: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmsub_s_2: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; 
RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 13(sp) +; RV32IZHINXMIN-NEXT: lh a1, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmsub_s_2: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: sh a1, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %b_ = fadd half 0.0, %b %negb = fsub half -0.0, %b_ %1 = call half @llvm.fma.f16(half %a, half %negb, half %c) @@ -2552,10 +2633,10 @@ define half @fmadd_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmadd.h fa0, fa0, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmadd_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmadd_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmadd_s_contract: ; RV32I: # %bb.0: @@ -2645,28 +2726,17 @@ define half @fmadd_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: 
fmadd_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmadd_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmadd_s_contract: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fmul contract half %a, %b %2 = fadd contract half %1, %c ret half %2 @@ -2680,11 +2750,11 @@ define half @fmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmsub.h fa0, fa0, fa1, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmsub_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmsub_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmsub_s_contract: ; RV32I: # %bb.0: @@ -2790,34 +2860,20 @@ define half @fmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, 
fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmsub_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmsub_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmsub_s_contract: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fsub.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %c_ = fadd half 0.0, %c ; avoid negation using xor %1 = fmul contract half %a, %b %2 = fsub contract half %1, %c_ @@ -2834,13 +2890,13 @@ define half @fnmadd_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa4, fa3, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s_contract: -; CHECK-ZHINX: # 
%bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fnmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fnmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s_contract: ; RV32I: # %bb.0: @@ -2964,86 +3020,119 @@ define half @fnmadd_s_contract(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_contract: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa3, fa3 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa3 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa3, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa3 -; CHECK-RV32-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_contract: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; 
CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa3, fa3 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa3 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa3, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa3 -; CHECK-RV64-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; 
RV32IZFHMIN-LABEL: fnmadd_s_contract: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV32IZFHMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa3, fa3 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa3 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa3, 12(sp) +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa3 +; RV32IZFHMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s_contract: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV64IZFHMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa3, fa3 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa3 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa3, 8(sp) +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa3 +; 
RV64IZFHMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmadd_s_contract: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a2 +; RV32IZHINXMIN-NEXT: xori a0, a0, 128 +; RV32IZHINXMIN-NEXT: sb a0, 13(sp) +; RV32IZHINXMIN-NEXT: lh a0, 12(sp) +; RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fsub.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmadd_s_contract: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a2 +; RV64IZHINXMIN-NEXT: xori a0, a0, 128 +; RV64IZHINXMIN-NEXT: sb a0, 9(sp) +; RV64IZHINXMIN-NEXT: lh a0, 8(sp) +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s 
a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fsub.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a ; avoid negation using xor %b_ = fadd half 0.0, %b ; avoid negation using xor %c_ = fadd half 0.0, %c ; avoid negation using xor @@ -3062,12 +3151,12 @@ define half @fnmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmsub.h fa0, fa4, fa5, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmsub_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fnmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmsub_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fnmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmsub_s_contract: ; RV32I: # %bb.0: @@ -3190,40 +3279,23 @@ define half @fnmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmsub_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fsub.s a0, a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmsub_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, 
a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fnmsub_s_contract: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fsub.s a0, a1, a0 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a ; avoid negation using xor %b_ = fadd half 0.0, %b ; avoid negation using xor %1 = fmul contract half %a_, %b_ @@ -3237,10 +3309,10 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; CHECKIZFH-NEXT: fsgnjx.h fa0, fa1, fa0 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsgnjx_f16: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsgnjx.h a0, a1, a0 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsgnjx_f16: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsgnjx.h a0, a1, a0 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsgnjx_f16: ; RV32I: # %bb.0: @@ -3294,47 +3366,89 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fsgnjx_f16: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi 
sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, %lo(.LCPI23_0)(a0) -; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a1, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV32-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fsgnjx_f16: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, %lo(.LCPI23_0)(a0) -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV64-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret +; RV32IZFHMIN-LABEL: fsgnjx_f16: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) +; RV32IZFHMIN-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; RV32IZFHMIN-NEXT: fsh fa0, 12(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: lbu a1, 9(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 128 +; RV32IZFHMIN-NEXT: andi a1, a1, 127 +; 
RV32IZFHMIN-NEXT: or a0, a1, a0 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa5, 8(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmul.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fsgnjx_f16: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) +; RV64IZFHMIN-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; RV64IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: lbu a1, 1(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 128 +; RV64IZFHMIN-NEXT: andi a1, a1, 127 +; RV64IZFHMIN-NEXT: or a0, a1, a0 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa5, 0(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmul.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fsgnjx_f16: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) +; RV32IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: sh a2, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV32IZHINXMIN-NEXT: andi a0, a0, 128 +; RV32IZHINXMIN-NEXT: andi a2, a2, 127 +; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fsgnjx_f16: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) +; RV64IZHINXMIN-NEXT: lh 
a2, %lo(.LCPI23_0)(a2) +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: sh a2, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 1(sp) +; RV64IZHINXMIN-NEXT: andi a0, a0, 128 +; RV64IZHINXMIN-NEXT: andi a2, a2, 127 +; RV64IZHINXMIN-NEXT: or a0, a2, a0 +; RV64IZHINXMIN-NEXT: sb a0, 1(sp) +; RV64IZHINXMIN-NEXT: lh a0, 0(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %z = call half @llvm.copysign.f16(half 1.0, half %x) %mul = fmul half %z, %y ret half %mul diff --git a/llvm/test/MC/AArch64/SVE/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch-negative.s deleted file mode 100644 index e3029c16ffc8a6..00000000000000 --- a/llvm/test/MC/AArch64/SVE/directive-arch-negative.s +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s - -// Check that setting +nosve implies +nosve2 -.arch armv9-a+nosve - -adclb z0.s, z1.s, z31.s -// CHECK: error: instruction requires: sve2 -// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s index 31118f7490d00d..661f13974d0bc8 100644 --- a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s @@ -1,12 +1,7 @@ // RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s -.arch_extension sve2+nosve +.arch_extension nosve ptrue p0.b, pow2 // CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 - -// Check that setting +nosve implies +nosve2 -adclb z0.s, z1.s, z31.s -// CHECK: error: instruction requires: sve2 -// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s 
b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s index 6ba537ca70609e..82acc1b0b0be9b 100644 --- a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s @@ -1,11 +1,6 @@ // RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s -.cpu generic+sve2+nosve +.cpu generic+sve+nosve ptrue p0.b, pow2 // CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 - -// Check that setting +nosve implies +nosve2 -adclb z0.s, z1.s, z31.s -// CHECK: error: instruction requires: sve2 -// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/directive-arch-negative.s b/llvm/test/MC/AArch64/directive-arch-negative.s index 406507d5fc8f4d..f60759899aa6c9 100644 --- a/llvm/test/MC/AArch64/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/directive-arch-negative.s @@ -12,13 +12,10 @@ # CHECK-NEXT: aese v0.8h, v1.8h # CHECK-NEXT: ^ +// We silently ignore invalid features. .arch armv8+foo aese v0.8h, v1.8h -# CHECK: error: unsupported architectural extension: foo -# CHECK-NEXT: .arch armv8+foo -# CHECK-NEXT: ^ - # CHECK: error: invalid operand for instruction # CHECK-NEXT: aese v0.8h, v1.8h # CHECK-NEXT: ^ diff --git a/llvm/test/MC/AArch64/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/directive-arch_extension-negative.s index 1843af56555461..1c1cfc9d33e3ed 100644 --- a/llvm/test/MC/AArch64/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/directive-arch_extension-negative.s @@ -4,7 +4,7 @@ // RUN: -filetype asm -o - %s 2>&1 | FileCheck %s .arch_extension axp64 -// CHECK: error: unsupported architectural extension: axp64 +// CHECK: error: unknown architectural extension: axp64 // CHECK-NEXT: .arch_extension axp64 crc32cx w0, w1, x3 @@ -49,8 +49,6 @@ fminnm d0, d0, d1 // CHECK: [[@LINE-1]]:1: error: instruction requires: fp // CHECK-NEXT: fminnm d0, d0, d1 -// nofp implied nosimd, so reinstate it -.arch_extension simd addp v0.4s, v0.4s, v0.4s // 
CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: neon .arch_extension nosimd @@ -72,8 +70,6 @@ casa w5, w7, [x20] // CHECK: [[@LINE-1]]:1: error: instruction requires: lse // CHECK-NEXT: casa w5, w7, [x20] -// nolse implied nolse128, so reinstate it -.arch_extension lse128 swpp x0, x2, [x3] // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: lse128 .arch_extension nolse128 @@ -88,8 +84,6 @@ cfp rctx, x0 // CHECK: [[@LINE-1]]:5: error: CFPRCTX requires: predres // CHECK-NEXT: cfp rctx, x0 -// nopredres implied nopredres2, so reinstate it -.arch_extension predres2 cosp rctx, x0 // CHECK-NOT: [[@LINE-1]]:6: error: COSP requires: predres2 .arch_extension nopredres2 @@ -139,8 +133,6 @@ ldapr x0, [x1] // CHECK: [[@LINE-1]]:1: error: instruction requires: rcpc // CHECK-NEXT: ldapr x0, [x1] -// norcpc implied norcpc3, so reinstate it -.arch_extension rcpc3 stilp w24, w0, [x16, #-8]! // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: rcpc3 .arch_extension norcpc3 @@ -177,8 +169,6 @@ cpyfp [x0]!, [x1]!, x2! // CHECK: [[@LINE-1]]:1: error: instruction requires: mops // CHECK-NEXT: cpyfp [x0]!, [x1]!, x2! 
-// nolse128 implied nod128, so reinstate it -.arch_extension d128 // This needs to come before `.arch_extension nothe` as it uses an instruction // that requires both the and d128 sysp #0, c2, c0, #0, x0, x1 @@ -214,8 +204,6 @@ umax x0, x1, x2 // CHECK: [[@LINE-1]]:1: error: instruction requires: cssc // CHECK-NEXT: umax x0, x1, x2 -// noras implied norasv2, so reinstate it -.arch_extension rasv2 mrs x0, ERXGSR_EL1 // CHECK-NOT: [[@LINE-1]]:9: error: expected readable system register .arch_extension norasv2 diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 45335b348b7e8f..48aec4bc52a0c5 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -854,85 +854,85 @@ main: # CHECK: f16x8.replace_lane 1 # encoding: [0xfd,0xa2,0x02,0x01] f16x8.replace_lane 1 - # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + # CHECK: f16x8.add # encoding: [0xfd,0xbd,0x02] f16x8.add - # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + # CHECK: f16x8.sub # encoding: [0xfd,0xbe,0x02] f16x8.sub - # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + # CHECK: f16x8.mul # encoding: [0xfd,0xbf,0x02] f16x8.mul - # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + # CHECK: f16x8.div # encoding: [0xfd,0xc0,0x02] f16x8.div - # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + # CHECK: f16x8.min # encoding: [0xfd,0xc1,0x02] f16x8.min - # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + # CHECK: f16x8.max # encoding: [0xfd,0xc2,0x02] f16x8.max - # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + # CHECK: f16x8.pmin # encoding: [0xfd,0xc3,0x02] f16x8.pmin - # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + # CHECK: f16x8.pmax # encoding: [0xfd,0xc4,0x02] f16x8.pmax - # CHECK: f16x8.eq # encoding: [0xfd,0xc0,0x02] + # CHECK: f16x8.eq # encoding: [0xfd,0xb7,0x02] f16x8.eq - # CHECK: f16x8.ne # encoding: [0xfd,0xc1,0x02] + # CHECK: f16x8.ne # encoding: [0xfd,0xb8,0x02] f16x8.ne - # CHECK: f16x8.lt # encoding: 
[0xfd,0xc2,0x02] + # CHECK: f16x8.lt # encoding: [0xfd,0xb9,0x02] f16x8.lt - # CHECK: f16x8.gt # encoding: [0xfd,0xc3,0x02] + # CHECK: f16x8.gt # encoding: [0xfd,0xba,0x02] f16x8.gt - # CHECK: f16x8.le # encoding: [0xfd,0xc4,0x02] + # CHECK: f16x8.le # encoding: [0xfd,0xbb,0x02] f16x8.le - # CHECK: f16x8.ge # encoding: [0xfd,0xc5,0x02] + # CHECK: f16x8.ge # encoding: [0xfd,0xbc,0x02] f16x8.ge - # CHECK: f16x8.abs # encoding: [0xfd,0xb1,0x02] + # CHECK: f16x8.abs # encoding: [0xfd,0xb0,0x02] f16x8.abs - # CHECK: f16x8.neg # encoding: [0xfd,0xb2,0x02] + # CHECK: f16x8.neg # encoding: [0xfd,0xb1,0x02] f16x8.neg - # CHECK: f16x8.sqrt # encoding: [0xfd,0xb3,0x02] + # CHECK: f16x8.sqrt # encoding: [0xfd,0xb2,0x02] f16x8.sqrt - # CHECK: f16x8.ceil # encoding: [0xfd,0xbc,0x02] + # CHECK: f16x8.ceil # encoding: [0xfd,0xb3,0x02] f16x8.ceil - # CHECK: f16x8.floor # encoding: [0xfd,0xbd,0x02] + # CHECK: f16x8.floor # encoding: [0xfd,0xb4,0x02] f16x8.floor - # CHECK: f16x8.trunc # encoding: [0xfd,0xbe,0x02] + # CHECK: f16x8.trunc # encoding: [0xfd,0xb5,0x02] f16x8.trunc - # CHECK: f16x8.nearest # encoding: [0xfd,0xbf,0x02] + # CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02] f16x8.nearest - # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xc6,0x02] + # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02] f16x8.relaxed_madd - # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xc7,0x02] + # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02] f16x8.relaxed_nmadd - # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc8,0x02] + # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02] i16x8.trunc_sat_f16x8_s - # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc9,0x02] + # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc6,0x02] i16x8.trunc_sat_f16x8_u - # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xca,0x02] + # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xc7,0x02] f16x8.convert_i16x8_s - # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xcb,0x02] + # CHECK: 
f16x8.convert_i16x8_u # encoding: [0xfd,0xc8,0x02] f16x8.convert_i16x8_u end_function diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll new file mode 100644 index 00000000000000..42819d5421ca0f --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll @@ -0,0 +1,116 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as memprof-basic.ll. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,blah, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +source_filename = "memprof-aliased-location1.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +declare void @blah() + +define internal ptr @_Z3barv() #0 { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() #0 { +entry: + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + %call = call ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() #0 { +entry: + %call = call ptr 
@_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { noinline optnone } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll new file mode 100644 index 
00000000000000..663f8525043c2f --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll @@ -0,0 +1,116 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as memprof-basic.ll. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,blah, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +source_filename = "memprof-aliased-location2.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +declare void @blah() + +define internal ptr @_Z3barv() #0 { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() #0 { +entry: + %call = call ptr @_Z3barv(), !callsite !8 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() #0 { +entry: + %call = call ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { noinline optnone } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} 
+!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll new file mode 
100644 index 00000000000000..3f5dc7732dc5c3 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll @@ -0,0 +1,99 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as memprof-tailcall.ll. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,blah, \ +; RUN: -stats -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +source_filename = "memprof-tailcall-aliased-location1.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call {{.*}} @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: 
@_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The first call to foo is part of a cold context, and should use the + ;; original functions. + ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions. + ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} diff --git a/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll new file mode 100644 index 00000000000000..3085b4e41938b2 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll @@ -0,0 +1,99 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent tail call in the alloc +;; context does not cause missing or incorrect cloning. 
This test is otherwise +;; the same as memprof-tailcall.ll. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,blah, \ +; RUN: -stats -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +source_filename = "memprof-tailcall-aliased-location2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call {{.*}} @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; The first call to foo is part of a cold context, and should use the + ;; original functions. 
+ ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions. + ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll new file mode 100644 index 00000000000000..8f9df20471e41c --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll @@ -0,0 +1,274 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as basic.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !0 + %call1 = call noundef ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin +declare void @_ZdaPv() #2 + +define internal ptr @_Z3barv() #3 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) +declare void @blah() + +define internal ptr @_Z3bazv() #4 { +entry: + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + %call = call noundef ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +; Function Attrs: noinline +define internal ptr @_Z3foov() #5 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin } +attributes #3 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #4 = { "stack-protector-buffer-size"="8" } +attributes #5 = { noinline } +attributes #6 = { builtin } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 
9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: 
CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAR2:0x[a-z0-9]+]] + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAZ2:0x[a-z0-9]+]] + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[FOO2:0x[a-z0-9]+]] + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[FOO2]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; 
DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[FOO]] + +; DUMP: Node [[BAZ2]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAZ]] + +; DUMP: Node [[BAR2]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAR]] + + +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + +; SIZES: NotCold context 1 with total size 100 is NotCold after cloning +; SIZES: Cold context 2 with total size 400 is Cold after cloning + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. 
It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } + + +; DOTCLONED: digraph "cloned" { +; DOTCLONED: label="cloned"; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 
2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO2]] -> Node[[BAZ2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll new file mode 100644 index 00000000000000..c3c164d4928632 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll @@ -0,0 +1,274 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as basic.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !0 + %call1 = call noundef ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin +declare void @_ZdaPv() #2 + +define internal ptr @_Z3barv() #3 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) +declare void @blah() + +define internal ptr @_Z3bazv() #4 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !8 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + ret ptr null +} + +; Function Attrs: noinline +define internal ptr @_Z3foov() #5 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin } +attributes #3 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #4 = { "stack-protector-buffer-size"="8" } +attributes #5 = { noinline } +attributes #6 = { builtin } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 
9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: 
CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAR2:0x[a-z0-9]+]] + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAZ2:0x[a-z0-9]+]] + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[FOO2:0x[a-z0-9]+]] + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[FOO2]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; 
DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[FOO]] + +; DUMP: Node [[BAZ2]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAZ]] + +; DUMP: Node [[BAR2]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAR]] + + +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + +; SIZES: NotCold context 1 with total size 100 is NotCold after cloning +; SIZES: Cold context 2 with total size 400 is Cold after cloning + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. 
It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } + + +; DOTCLONED: digraph "cloned" { +; DOTCLONED: label="cloned"; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 
2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO2]] -> Node[[BAZ2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll new file mode 100644 index 00000000000000..e0bcd284c097c2 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll @@ -0,0 +1,100 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as tailcall.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -stats %s -S 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=IR + +source_filename = "tailcall-aliased-location1.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [2 x ptr] [ptr @_Z2a1v, ptr @_Z2a2v], align 16 + +declare void @_Z2a1v() #0 + +declare void @_Z2a2v() #0 + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call ptr @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The first call to foo is part of the non-cold context, and should + ;; therefore use the original functions for the allocation, while the + ;; second (cold) call to foo below should call the cloned functions. + ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions.
+ ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + %2 = load ptr, ptr @a, align 16 + call void %2(), !callsite !10 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call ptr @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3, !8} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} +!8 = !{!9, !"notcold"} +!9 = !{i64 3186456655321080972, i64 1} +!10 = !{i64 1} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll new file mode 100644 index 00000000000000..1e76243fe0f48b --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll @@ -0,0 +1,100 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as tailcall.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -stats %s -S 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=IR + +source_filename = "tailcall-aliased-location2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [2 x ptr] [ptr @_Z2a1v, ptr @_Z2a2v], align 16 + +declare void @_Z2a1v() #0 + +declare void @_Z2a2v() #0 + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call ptr @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; The first call to foo is part of the non-cold context, and should + ;; therefore use the original functions for the allocation, while the + ;; second (cold) call to foo below should call the cloned functions. + ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions.
+ ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + %2 = load ptr, ptr @a, align 16 + call void %2(), !callsite !10 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call ptr @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3, !8} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} +!8 = !{!9, !"notcold"} +!9 = !{i64 3186456655321080972, i64 1} +!10 = !{i64 1} diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn index 243a92f2e62587..aa594df8c164a1 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn @@ -71,6 +71,7 @@ static_library("LLVMBPFCodeGen") { "BPFISelLowering.cpp", "BPFInstrInfo.cpp", "BPFMCInstLower.cpp", + "BPFMIChecking.cpp", "BPFMIPeephole.cpp", "BPFMISimplifyPatchable.cpp", "BPFPreserveDIType.cpp", diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 8413691910189a..d22df6a7857c1d 
100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -263,7 +263,8 @@ std::unique_ptr createSparsificationAndBufferizationPass( bool createSparseDeallocs, bool enableRuntimeLibrary, bool enableBufferInitialization, unsigned vectorLength, bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen, - SparseEmitStrategy emitStrategy); + SparseEmitStrategy emitStrategy, + SparseParallelizationStrategy parallelizationStrategy); //===----------------------------------------------------------------------===// // Sparse Iteration Transform Passes diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 8ec18a1e186481..a534381bd5c2f3 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -496,6 +496,23 @@ def SparsificationAndBufferization : Pass<"sparsification-and-bufferization", "M "Emit (experimental) loops (with sparse.iterate)."), clEnumValN(mlir::SparseEmitStrategy::kDebugInterface, "debug-interface", "Emit non-functional but easy-to-read interfaces to debug."))}]>, + Option<"parallelization", "parallelization-strategy", "mlir::SparseParallelizationStrategy", + "mlir::SparseParallelizationStrategy::kNone", + "Set the parallelization strategy", [{llvm::cl::values( + clEnumValN(mlir::SparseParallelizationStrategy::kNone, "none", + "Turn off sparse parallelization."), + clEnumValN(mlir::SparseParallelizationStrategy::kDenseOuterLoop, + "dense-outer-loop", + "Enable dense outer loop sparse parallelization."), + clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageOuterLoop, + "any-storage-outer-loop", + "Enable sparse parallelization regardless of storage for the outer loop."), + clEnumValN(mlir::SparseParallelizationStrategy::kDenseAnyLoop, + "dense-any-loop", + "Enable dense parallelization for 
any loop."), + clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop, + "any-storage-any-loop", + "Enable sparse parallelization for any storage and loop."))}]>, ]; } diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5f680e8eca7559..60113bdef16a23 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -1124,17 +1124,6 @@ struct ConversionConfig { // already been modified) and iterators into past IR state cannot be // represented at the moment. RewriterBase::Listener *listener = nullptr; - - /// If set to "true", the dialect conversion attempts to build source/target/ - /// argument materializations through the type converter API in lieu of - /// builtin.unrealized_conversion_cast ops. The conversion process fails if - /// at least one materialization could not be built. - /// - /// If set to "false", the dialect conversion does not does not build any - /// custom materializations and instead inserts - /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR - /// is valid. 
- bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index 12e330ac7efbdf..abc4a4c252841b 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -45,7 +45,8 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, /*enableVLAVectorization=*/options.armSVE, /*enableSIMDIndex32=*/options.force32BitVectorIndices, options.enableGPULibgen, - options.sparsificationOptions().sparseEmitStrategy)); + options.sparsificationOptions().sparseEmitStrategy, + options.sparsificationOptions().parallelizationStrategy)); // Bail-early for test setup. if (options.testBufferizationAnalysisOnly) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp index e088328848c9c8..6e882a8d0ff30a 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp @@ -78,7 +78,8 @@ class SparsificationAndBufferizationPass const SparsificationOptions &sparsificationOptions, bool createSparseDeallocs, bool enableRuntimeLibrary, bool enableBufferInitialization, unsigned vl, bool vla, bool index32, - bool gpu, SparseEmitStrategy emitStrategy) + bool gpu, SparseEmitStrategy emitStrategy, + SparseParallelizationStrategy parallelizationStrategy) : bufferizationOptions(bufferizationOptions), sparsificationOptions(sparsificationOptions), createSparseDeallocs(createSparseDeallocs), @@ -90,6 +91,7 @@ class SparsificationAndBufferizationPass enableSIMDIndex32 = index32; enableGPULibgen = gpu; sparseEmitStrategy = emitStrategy; + parallelization = 
parallelizationStrategy; } /// Bufferize all dense ops. This assumes that no further analysis is needed @@ -124,6 +126,9 @@ class SparsificationAndBufferizationPass // Overrides the default emit strategy using user-provided value. this->sparsificationOptions.sparseEmitStrategy = sparseEmitStrategy; + // Overrides the default parallelization strategy using user-provided value. + this->sparsificationOptions.parallelizationStrategy = parallelization; + // Run enabling transformations. { OpPassManager pm("builtin.module"); @@ -248,10 +253,12 @@ std::unique_ptr mlir::createSparsificationAndBufferizationPass( bool createSparseDeallocs, bool enableRuntimeLibrary, bool enableBufferInitialization, unsigned vectorLength, bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen, - SparseEmitStrategy emitStrategy) { + SparseEmitStrategy emitStrategy, + SparseParallelizationStrategy parallelizationStrategy) { return std::make_unique< mlir::sparse_tensor::SparsificationAndBufferizationPass>( bufferizationOptions, sparsificationOptions, createSparseDeallocs, enableRuntimeLibrary, enableBufferInitialization, vectorLength, - enableVLAVectorization, enableSIMDIndex32, enableGPULibgen, emitStrategy); + enableVLAVectorization, enableSIMDIndex32, enableGPULibgen, emitStrategy, + parallelizationStrategy); } diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index cc9c9495e5155c..b23fb97959ed67 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -702,12 +702,14 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return rewrite->getKind() == Kind::UnresolvedMaterialization; } - void rollback() override; - UnrealizedConversionCastOp getOperation() const { return cast(op); } + void rollback() override; + + void cleanup(RewriterBase &rewriter) override; + /// Return the type converter of this materialization (which may be null). 
const TypeConverter *getConverter() const { return converterAndKind.getPointer(); @@ -764,7 +766,7 @@ namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : context(ctx), eraseRewriter(ctx), config(config) {} + : context(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -832,7 +834,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { //===--------------------------------------------------------------------===// // Materializations //===--------------------------------------------------------------------===// - /// Build an unresolved materialization operation given an output type and set /// of input operands. Value buildUnresolvedMaterialization(MaterializationKind kind, @@ -881,7 +882,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given op (unless it was already erased). void eraseOp(Operation *op) override { - if (wasErased(op)) + if (erased.contains(op)) return; op->dropAllUses(); RewriterBase::eraseOp(op); @@ -889,24 +890,17 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block) override { - if (wasErased(block)) + if (erased.contains(block)) return; assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } - bool wasErased(void *ptr) const { return erased.contains(ptr); } - - bool wasErased(OperationRewrite *rewrite) const { - return wasErased(rewrite->getOperation()); - } - void notifyOperationErased(Operation *op) override { erased.insert(op); } void notifyBlockErased(Block *block) override { erased.insert(block); } - private: /// Pointers to all erased operations and blocks. 
DenseSet erased; }; @@ -918,11 +912,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// MLIR context. MLIRContext *context; - /// A rewriter that keeps track of ops/block that were already erased and - /// skips duplicate op/block erasures. This rewriter is used during the - /// "cleanup" phase. - SingleEraseRewriter eraseRewriter; - // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; @@ -1069,6 +1058,10 @@ void UnresolvedMaterializationRewrite::rollback() { op->erase(); } +void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) { + rewriter.eraseOp(op); +} + void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. IRRewriter rewriter(context, config.listener); @@ -1076,6 +1069,7 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(rewriter); // Clean up all rewrites. + SingleEraseRewriter eraseRewriter(context); for (auto &rewrite : rewrites) rewrite->cleanup(eraseRewriter); } @@ -2359,6 +2353,12 @@ struct OperationConverter { ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping); + /// Legalize any unresolved type materializations. + LogicalResult legalizeUnresolvedMaterializations( + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping); + /// Legalize an operation result that was marked as "erased". 
LogicalResult legalizeErasedResult(Operation *op, OpResult result, @@ -2405,56 +2405,6 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return success(); } -static LogicalResult -legalizeUnresolvedMaterialization(RewriterBase &rewriter, - UnresolvedMaterializationRewrite *rewrite) { - UnrealizedConversionCastOp op = rewrite->getOperation(); - assert(!op.use_empty() && - "expected that dead materializations have already been DCE'd"); - Operation::operand_range inputOperands = op.getOperands(); - Type outputType = op.getResultTypes()[0]; - - // Try to materialize the conversion. - if (const TypeConverter *converter = rewrite->getConverter()) { - rewriter.setInsertionPoint(op); - Value newMaterialization; - switch (rewrite->getMaterializationKind()) { - case MaterializationKind::Argument: - // Try to materialize an argument conversion. - newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), outputType, inputOperands); - if (newMaterialization) - break; - // If an argument materialization failed, fallback to trying a target - // materialization. 
- [[fallthrough]]; - case MaterializationKind::Target: - newMaterialization = converter->materializeTargetConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - case MaterializationKind::Source: - newMaterialization = converter->materializeSourceConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - } - if (newMaterialization) { - assert(newMaterialization.getType() == outputType && - "materialization callback produced value of incorrect type"); - rewriter.replaceOp(op, newMaterialization); - return success(); - } - } - - InFlightDiagnostic diag = op->emitError() - << "failed to legalize unresolved materialization " - "from (" - << inputOperands.getTypes() << ") to " << outputType - << " that remained live after conversion"; - diag.attachNote(op->getUsers().begin()->getLoc()) - << "see existing live user here: " << *op->getUsers().begin(); - return failure(); -} - LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); @@ -2496,37 +2446,6 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { } else { rewriterImpl.applyRewrites(); } - - // Gather all unresolved materializations. - SmallVector allCastOps; - DenseMap rewriteMap; - for (std::unique_ptr &rewrite : rewriterImpl.rewrites) { - auto *mat = dyn_cast(rewrite.get()); - if (!mat) - continue; - if (rewriterImpl.eraseRewriter.wasErased(mat)) - continue; - allCastOps.push_back(mat->getOperation()); - rewriteMap[mat->getOperation()] = mat; - } - - // Reconcile all UnrealizedConversionCastOps that were inserted by the - // dialect conversion frameworks. (Not the one that were inserted by - // patterns.) - SmallVector remainingCastOps; - reconcileUnrealizedCasts(allCastOps, &remainingCastOps); - - // Try to legalize all unresolved materializations. 
- if (config.buildMaterializations) { - IRRewriter rewriter(rewriterImpl.context, config.listener); - for (UnrealizedConversionCastOp castOp : remainingCastOps) { - auto it = rewriteMap.find(castOp.getOperation()); - assert(it != rewriteMap.end() && "inconsistent state"); - if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) - return failure(); - } - } - return success(); } @@ -2540,6 +2459,9 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl, inverseMapping))) return failure(); + if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, + inverseMapping))) + return failure(); return success(); } @@ -2655,6 +2577,279 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( return success(); } +/// Replace the results of a materialization operation with the given values. +static void +replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, + ResultRange matResults, ValueRange values, + DenseMap> &inverseMapping) { + matResults.replaceAllUsesWith(values); + + // For each of the materialization results, update the inverse mappings to + // point to the replacement values. + for (auto [matResult, newValue] : llvm::zip(matResults, values)) { + auto inverseMapIt = inverseMapping.find(matResult); + if (inverseMapIt == inverseMapping.end()) + continue; + + // Update the reverse mapping, or remove the mapping if we couldn't update + // it. Not being able to update signals that the mapping would have become + // circular (i.e. %foo -> newValue -> %foo), which may occur as values are + // propagated through temporary materializations. We simply drop the + // mapping, and let the post-conversion replacement logic handle updating + // uses. 
+ for (Value inverseMapVal : inverseMapIt->second) + if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue)) + rewriterImpl.mapping.erase(inverseMapVal); + } +} + +/// Compute all of the unresolved materializations that will persist beyond the +/// conversion process, and require inserting a proper user materialization for. +static void computeNecessaryMaterializations( + DenseMap + &materializationOps, + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping, + SetVector &necessaryMaterializations) { + // Helper function to check if the given value or a not yet materialized + // replacement of the given value is live. + // Note: `inverseMapping` maps from replaced values to original values. + auto isLive = [&](Value value) { + auto findFn = [&](Operation *user) { + auto matIt = materializationOps.find(user); + if (matIt != materializationOps.end()) + return !necessaryMaterializations.count(matIt->second); + return rewriterImpl.isOpIgnored(user); + }; + // A worklist is needed because a value may have gone through a chain of + // replacements and each of the replaced values may have live users. + SmallVector worklist; + worklist.push_back(value); + while (!worklist.empty()) { + Value next = worklist.pop_back_val(); + if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) + return true; + // This value may be replacing another value that has a live user. + llvm::append_range(worklist, inverseMapping.lookup(next)); + } + return false; + }; + + llvm::unique_function lookupRemappedValue = + [&](Value invalidRoot, Value value, Type type) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); + if (remappedValue.getType() == type && remappedValue != invalidRoot) + return remappedValue; + + // Check to see if the input is a materialization operation that + // provides an inverse conversion. 
We just check blindly for + // UnrealizedConversionCastOp here, but it has no effect on correctness. + auto inputCastOp = value.getDefiningOp(); + if (inputCastOp && inputCastOp->getNumOperands() == 1) + return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0), + type); + + return Value(); + }; + + SetVector worklist; + for (auto &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + materializationOps.try_emplace(mat->getOperation(), mat); + worklist.insert(mat); + } + while (!worklist.empty()) { + UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); + UnrealizedConversionCastOp op = mat->getOperation(); + + // We currently only handle target materializations here. + assert(op->getNumResults() == 1 && "unexpected materialization type"); + OpResult opResult = op->getOpResult(0); + Type outputType = opResult.getType(); + Operation::operand_range inputOperands = op.getOperands(); + + // Try to forward propagate operands for user conversion casts that result + // in the input types of the current cast. + for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) { + auto castOp = dyn_cast(user); + if (!castOp) + continue; + if (castOp->getResultTypes() == inputOperands.getTypes()) { + replaceMaterialization(rewriterImpl, user->getResults(), inputOperands, + inverseMapping); + necessaryMaterializations.remove(materializationOps.lookup(user)); + } + } + + // Try to avoid materializing a resolved materialization if possible. + // Handle the case of a 1-1 materialization. + if (inputOperands.size() == 1) { + // Check to see if the input operation was remapped to a variant of the + // output. 
+ Value remappedValue = + lookupRemappedValue(opResult, inputOperands[0], outputType); + if (remappedValue && remappedValue != opResult) { + replaceMaterialization(rewriterImpl, opResult, remappedValue, + inverseMapping); + necessaryMaterializations.remove(mat); + continue; + } + } else { + // TODO: Avoid materializing other types of conversions here. + } + + // If the materialization does not have any live users, we don't need to + // generate a user materialization for it. + bool isMaterializationLive = isLive(opResult); + if (!isMaterializationLive) + continue; + if (!necessaryMaterializations.insert(mat)) + continue; + + // Reprocess input materializations to see if they have an updated status. + for (Value input : inputOperands) { + if (auto parentOp = input.getDefiningOp()) { + if (auto *mat = materializationOps.lookup(parentOp)) + worklist.insert(mat); + } + } + } +} + +/// Legalize the given unresolved materialization. Returns success if the +/// materialization was legalized, failure otherwise. +static LogicalResult legalizeUnresolvedMaterialization( + UnresolvedMaterializationRewrite &mat, + DenseMap + &materializationOps, + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping) { + auto findLiveUser = [&](auto &&users) { + auto liveUserIt = llvm::find_if_not( + users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); }); + return liveUserIt == users.end() ? nullptr : *liveUserIt; + }; + + llvm::unique_function lookupRemappedValue = + [&](Value value, Type type) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); + if (remappedValue.getType() == type) + return remappedValue; + return Value(); + }; + + UnrealizedConversionCastOp op = mat.getOperation(); + if (!rewriterImpl.ignoredOps.insert(op)) + return success(); + + // We currently only handle target materializations here.
+ OpResult opResult = op->getOpResult(0); + Operation::operand_range inputOperands = op.getOperands(); + Type outputType = opResult.getType(); + + // If any input to this materialization is another materialization, resolve + // the input first. + for (Value value : op->getOperands()) { + auto valueCast = value.getDefiningOp(); + if (!valueCast) + continue; + + auto matIt = materializationOps.find(valueCast); + if (matIt != materializationOps.end()) + if (failed(legalizeUnresolvedMaterialization( + *matIt->second, materializationOps, rewriter, rewriterImpl, + inverseMapping))) + return failure(); + } + + // Perform a last ditch attempt to avoid materializing a resolved + // materialization if possible. + // Handle the case of a 1-1 materialization. + if (inputOperands.size() == 1) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = lookupRemappedValue(inputOperands[0], outputType); + if (remappedValue && remappedValue != opResult) { + replaceMaterialization(rewriterImpl, opResult, remappedValue, + inverseMapping); + return success(); + } + } else { + // TODO: Avoid materializing other types of conversions here. + } + + // Try to materialize the conversion. + if (const TypeConverter *converter = mat.getConverter()) { + rewriter.setInsertionPoint(op); + Value newMaterialization; + switch (mat.getMaterializationKind()) { + case MaterializationKind::Argument: + // Try to materialize an argument conversion. + newMaterialization = converter->materializeArgumentConversion( + rewriter, op->getLoc(), outputType, inputOperands); + if (newMaterialization) + break; + // If an argument materialization failed, fallback to trying a target + // materialization. 
+ [[fallthrough]]; + case MaterializationKind::Target: + newMaterialization = converter->materializeTargetConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + case MaterializationKind::Source: + newMaterialization = converter->materializeSourceConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + } + if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); + replaceMaterialization(rewriterImpl, opResult, newMaterialization, + inverseMapping); + return success(); + } + } + + InFlightDiagnostic diag = op->emitError() + << "failed to legalize unresolved materialization " + "from (" + << inputOperands.getTypes() << ") to " << outputType + << " that remained live after conversion"; + if (Operation *liveUser = findLiveUser(op->getUsers())) { + diag.attachNote(liveUser->getLoc()) + << "see existing live user here: " << *liveUser; + } + return failure(); +} + +LogicalResult OperationConverter::legalizeUnresolvedMaterializations( + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping) { + // As an initial step, compute all of the inserted materializations that we + // expect to persist beyond the conversion process. + DenseMap materializationOps; + SetVector necessaryMaterializations; + computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, + inverseMapping, necessaryMaterializations); + + // Once computed, legalize any necessary materializations. 
+ for (auto *mat : necessaryMaterializations) { + if (failed(legalizeUnresolvedMaterialization( + *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) + return failure(); + } + return success(); +} + LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 75362378daaaaa..156a8a468d5b42 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,6 +1286,7 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 +// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1298,8 +1299,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index ab18ce05e355d3..a192434c5accf8 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -80,7 +80,6 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref) -> memref // expected-error @+1 {{failed to legalize unresolved materialization from ('memref') to 'memref>' that remained live after conversion}} %1 = bufferization.to_memref %0 : memref> - // expected-note @below{{see existing live user here}} return %1 : memref> } diff --git 
a/mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir b/mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir new file mode 100644 index 00000000000000..d97d6e58a3df2d --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir @@ -0,0 +1,38 @@ +// RUN: mlir-opt %s --sparsification-and-bufferization | FileCheck %s --check-prefix=CHECK-NOPARA +// RUN: mlir-opt %s --sparsification-and-bufferization="parallelization-strategy=any-storage-any-loop" | FileCheck %s --check-prefix=CHECK-PARA + +// Test to ensure we can pass parallelization flags into +// the mini sparsification and bufferization pipeline. + +#SparseMatrix = #sparse_tensor.encoding<{ + map = (d0, d1) -> (d0 : compressed, d1 : compressed) +}> + +#trait_ss = { + indexing_maps = [ + affine_map<(i,j) -> (i,j)>, // A + affine_map<(i,j) -> (i,j)> // X (out) + ], + iterator_types = ["parallel", "parallel"], + doc = "X(i,j) = A(i,j) * SCALE" +} + +// +// CHECK-NOPARA-LABEL: func.func @scale_ss +// CHECK-NOPARA: scf.for +// +// CHECK-PARA-LABEL: func.func @scale_ss +// CHECK-PARA: scf.parallel +// +func.func @scale_ss(%scale: f32, + %arga: tensor, + %argx: tensor) -> tensor { + %0 = linalg.generic #trait_ss + ins(%arga: tensor) + outs(%argx: tensor) { + ^bb(%a: f32, %x: f32): + %0 = arith.mulf %a, %scale : f32 + linalg.yield %0 : f32 + } -> tensor + return %0 : tensor +} diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index f130adff42f8cd..cf2c9f6a8ec441 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -4,7 +4,6 @@ func.func @test_invalid_arg_materialization( // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}} %arg0: i16) { - // expected-note@below{{see existing live user here}} "foo.return"(%arg0) : (i16) -> () } @@ -23,7 +22,6 @@ func.func 
@test_valid_arg_materialization(%arg0: i64) { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -32,7 +30,6 @@ func.func @test_invalid_result_materialization() { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -52,7 +49,6 @@ func.func @test_transitive_use_materialization() { func.func @test_transitive_use_invalid_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.another_type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -103,9 +99,9 @@ func.func @test_block_argument_not_converted() { func.func @test_signature_conversion_no_converter() { "test.signature_conversion_no_converter"() ({ // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}} + // expected-note@below {{see existing live user here}} ^bb0(%arg0: f32): "test.type_consumer"(%arg0) : (f32) -> () - // expected-note@below{{see existing live user here}} "test.return"(%arg0) : (f32) -> () }) : () -> () return