diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 5fb32a1ea67848..08ce892054874c 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -71,14 +71,15 @@ struct SegmentInfo { uint64_t FileOffset; /// Offset in the file. uint64_t FileSize; /// Size in file. uint64_t Alignment; /// Alignment of the segment. + bool IsExecutable; /// Is the executable bit set on the Segment? void print(raw_ostream &OS) const { - OS << "SegmentInfo { Address: 0x" - << Twine::utohexstr(Address) << ", Size: 0x" - << Twine::utohexstr(Size) << ", FileOffset: 0x" + OS << "SegmentInfo { Address: 0x" << Twine::utohexstr(Address) + << ", Size: 0x" << Twine::utohexstr(Size) << ", FileOffset: 0x" << Twine::utohexstr(FileOffset) << ", FileSize: 0x" << Twine::utohexstr(FileSize) << ", Alignment: 0x" - << Twine::utohexstr(Alignment) << "}"; + << Twine::utohexstr(Alignment) << ", " << (IsExecutable ? "x" : " ") + << "}"; }; }; diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index cd137f457c1bdc..1347047e1b7060 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -2021,6 +2021,9 @@ BinaryContext::getBaseAddressForMapping(uint64_t MMapAddress, // Find a segment with a matching file offset. for (auto &KV : SegmentMapInfo) { const SegmentInfo &SegInfo = KV.second; + // Only consider executable segments. + if (!SegInfo.IsExecutable) + continue; // FileOffset is got from perf event, // and it is equal to alignDown(SegInfo.FileOffset, pagesize). // If the pagesize is not equal to SegInfo.Alignment. diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index fcde6f5f4642c8..0a63148379d900 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -2043,7 +2043,8 @@ std::error_code DataAggregator::parseMMapEvents() { // size of the mapping, but we know it should not exceed the segment // alignment value. Hence we are performing an approximate check. 
return SegInfo.Address >= MMapInfo.MMapAddress && - SegInfo.Address - MMapInfo.MMapAddress < SegInfo.Alignment; + SegInfo.Address - MMapInfo.MMapAddress < SegInfo.Alignment && + SegInfo.IsExecutable; }); if (!MatchFound) { errs() << "PERF2BOLT-WARNING: ignoring mapping of " << NameToUse diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 2f0fa6038bde02..46cd771f61eeec 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -526,11 +526,9 @@ Error RewriteInstance::discoverStorage() { NextAvailableOffset = std::max(NextAvailableOffset, Phdr.p_offset + Phdr.p_filesz); - BC->SegmentMapInfo[Phdr.p_vaddr] = SegmentInfo{Phdr.p_vaddr, - Phdr.p_memsz, - Phdr.p_offset, - Phdr.p_filesz, - Phdr.p_align}; + BC->SegmentMapInfo[Phdr.p_vaddr] = SegmentInfo{ + Phdr.p_vaddr, Phdr.p_memsz, Phdr.p_offset, + Phdr.p_filesz, Phdr.p_align, ((Phdr.p_flags & ELF::PF_X) != 0)}; if (BC->TheTriple->getArch() == llvm::Triple::x86_64 && Phdr.p_vaddr >= BinaryContext::KernelStartX86_64) BC->IsLinuxKernel = true; diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp index 6c3288146b7905..05b898d34af56c 100644 --- a/bolt/unittests/Core/BinaryContext.cpp +++ b/bolt/unittests/Core/BinaryContext.cpp @@ -160,13 +160,14 @@ TEST_P(BinaryContextTester, FlushPendingRelocJUMP26) { TEST_P(BinaryContextTester, BaseAddress) { // Check that base address calculation is correct for a binary with the // following segment layout: - BC->SegmentMapInfo[0] = SegmentInfo{0, 0x10e8c2b4, 0, 0x10e8c2b4, 0x1000}; + BC->SegmentMapInfo[0] = + SegmentInfo{0, 0x10e8c2b4, 0, 0x10e8c2b4, 0x1000, true}; BC->SegmentMapInfo[0x10e8d2b4] = - SegmentInfo{0x10e8d2b4, 0x3952faec, 0x10e8c2b4, 0x3952faec, 0x1000}; + SegmentInfo{0x10e8d2b4, 0x3952faec, 0x10e8c2b4, 0x3952faec, 0x1000, true}; BC->SegmentMapInfo[0x4a3bddc0] = - SegmentInfo{0x4a3bddc0, 0x148e828, 0x4a3bbdc0, 0x148e828, 0x1000}; + SegmentInfo{0x4a3bddc0, 0x148e828, 0x4a3bbdc0, 0x148e828, 0x1000, true}; BC->SegmentMapInfo[0x4b84d5e8] = - SegmentInfo{0x4b84d5e8, 0x294f830, 0x4b84a5e8, 0x3d3820, 0x1000}; + SegmentInfo{0x4b84d5e8, 0x294f830, 0x4b84a5e8, 0x3d3820, 0x1000, true}; std::optional BaseAddress = BC->getBaseAddressForMapping(0x7f13f5556000, 0x10e8c000); @@ -181,13 +182,13 @@ TEST_P(BinaryContextTester, BaseAddress2) { // Check that base address calculation is correct for a binary if the // alignment in ELF file are different from pagesize. 
// The segment layout is as follows: - BC->SegmentMapInfo[0] = SegmentInfo{0, 0x2177c, 0, 0x2177c, 0x10000}; + BC->SegmentMapInfo[0] = SegmentInfo{0, 0x2177c, 0, 0x2177c, 0x10000, true}; BC->SegmentMapInfo[0x31860] = - SegmentInfo{0x31860, 0x370, 0x21860, 0x370, 0x10000}; + SegmentInfo{0x31860, 0x370, 0x21860, 0x370, 0x10000, true}; BC->SegmentMapInfo[0x41c20] = - SegmentInfo{0x41c20, 0x1f8, 0x21c20, 0x1f8, 0x10000}; + SegmentInfo{0x41c20, 0x1f8, 0x21c20, 0x1f8, 0x10000, true}; BC->SegmentMapInfo[0x54e18] = - SegmentInfo{0x54e18, 0x51, 0x24e18, 0x51, 0x10000}; + SegmentInfo{0x54e18, 0x51, 0x24e18, 0x51, 0x10000, true}; std::optional BaseAddress = BC->getBaseAddressForMapping(0xaaaaea444000, 0x21000); @@ -197,3 +198,22 @@ TEST_P(BinaryContextTester, BaseAddress2) { BaseAddress = BC->getBaseAddressForMapping(0xaaaaea444000, 0x11000); ASSERT_FALSE(BaseAddress.has_value()); } + +TEST_P(BinaryContextTester, BaseAddressSegmentsSmallerThanAlignment) { + // Check that the correct segment is used to compute the base address + // when multiple segments are close together in the ELF file (closer + // than the required alignment in the process space). + // See https://github.com/llvm/llvm-project/issues/109384 + BC->SegmentMapInfo[0] = SegmentInfo{0, 0x1d1c, 0, 0x1d1c, 0x10000, false}; + BC->SegmentMapInfo[0x11d40] = + SegmentInfo{0x11d40, 0x11e0, 0x1d40, 0x11e0, 0x10000, true}; + BC->SegmentMapInfo[0x22f20] = + SegmentInfo{0x22f20, 0x10e0, 0x2f20, 0x1f0, 0x10000, false}; + BC->SegmentMapInfo[0x33110] = + SegmentInfo{0x33110, 0x89, 0x3110, 0x88, 0x10000, false}; + + std::optional BaseAddress = + BC->getBaseAddressForMapping(0xaaaaaaab1000, 0x1000); + ASSERT_TRUE(BaseAddress.has_value()); + ASSERT_EQ(*BaseAddress, 0xaaaaaaaa0000ULL); +} \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/ExternalClang-TidyExamples.rst b/clang-tools-extra/docs/clang-tidy/ExternalClang-TidyExamples.rst new file mode 100644 index 00000000000000..3d654e232a3ada --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/ExternalClang-TidyExamples.rst @@ -0,0 +1,30 @@ +============================ +External Clang-Tidy Examples +============================ + +Introduction +============ + +This page provides examples of what people have done with :program:`clang-tidy` that +might serve as useful guides (or starting points) to develop your own checks. +They may also help with practical necessities, such as how to write the `CMakeLists.txt` +for an out-of-tree plugin of :program:`clang-tidy` checks. + +If you know of (or wrote!) a tool or project using :program:`clang-tidy`, please share it +on `the Discourse forums (Clang Frontend category) +`_ for wider visibility and open a +pull-request on `LLVM Github`_ to have it added here. Since the primary purpose of +this page is to provide examples that can help developers, the listed projects should +have code available. + +Since :program:`clang-tidy` builds on Clang facilities such as the AST Matchers and diagnostics, +`External Clang Examples`_ may also be a useful reference for such examples. + +.. _LLVM Github: https://github.com/llvm/llvm-project +.. _External Clang Examples: https://clang.llvm.org/docs/ExternalClangExamples.html + +List of projects and tools +========================== + +``_ + "This folder contains :program:`clang-tidy` plugins."
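For readers of the new page above: the `CMakeLists.txt` it mentions builds a shared library whose C++ side registers its checks with :program:`clang-tidy`. Below is a minimal, illustrative sketch of such an out-of-tree module; it is not part of this patch, and the check name, module name, and include paths are invented for the example, while the registration interfaces (``ClangTidyCheck``, ``ClangTidyModule``, ``ClangTidyModuleRegistry``) are the ones :program:`clang-tidy`'s own checks use.

.. code-block:: c++

   // Hypothetical out-of-tree check: warns on functions whose names lack an
   // "awesome_" prefix, plus the module boilerplate that registers it.
   #include "clang-tidy/ClangTidyCheck.h"
   #include "clang-tidy/ClangTidyModule.h"
   #include "clang-tidy/ClangTidyModuleRegistry.h"

   using namespace clang;
   using namespace clang::ast_matchers;
   using namespace clang::tidy;

   namespace {

   class AwesomeFunctionNamesCheck : public ClangTidyCheck {
   public:
     AwesomeFunctionNamesCheck(StringRef Name, ClangTidyContext *Context)
         : ClangTidyCheck(Name, Context) {}

     void registerMatchers(MatchFinder *Finder) override {
       // Match every function declaration and hand it to check() below.
       Finder->addMatcher(functionDecl().bind("func"), this);
     }

     void check(const MatchFinder::MatchResult &Result) override {
       const auto *Func = Result.Nodes.getNodeAs<FunctionDecl>("func");
       if (!Func->getIdentifier() || Func->getName().starts_with("awesome_"))
         return;
       diag(Func->getLocation(), "function %0 is insufficiently awesome")
           << Func;
     }
   };

   // The module maps check names to factories; clang-tidy discovers it
   // through the registry entry below.
   class ExampleModule : public ClangTidyModule {
   public:
     void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override {
       CheckFactories.registerCheck<AwesomeFunctionNamesCheck>(
           "example-awesome-function-names");
     }
   };

   } // namespace

   // Register the module with clang-tidy's global module registry.
   static ClangTidyModuleRegistry::Add<ExampleModule>
       X("example-module", "Adds example checks.");

A plugin built from such a file (together with the out-of-tree `CMakeLists.txt` the page refers to) would then typically be loaded via :program:`clang-tidy`'s ``--load`` option, e.g. ``clang-tidy --load=libExampleTidyChecks.so --checks='-*,example-*' file.cpp``; the library name is again illustrative.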
diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index c8fc34c61caeb5..e38141bdb8be1f 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -12,6 +12,7 @@ See also: The list of clang-tidy checks Clang-tidy IDE/Editor Integrations Getting Involved + External Clang-Tidy Examples :program:`clang-tidy` is a clang-based C++ "linter" tool. Its purpose is to provide an extensible framework for diagnosing and fixing typical programming diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 22729e9e933fb7..3d8d289607c7e6 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -942,7 +942,7 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) -data ${BOLT_FDATA} -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats -use-gnu-stack - -split-strategy=cdsplit -update-debug-sections + -update-debug-sections ${BOLT_NO_LBR} COMMENT "Optimizing Clang with BOLT" USES_TERMINAL diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8af9bf8d3a254a..31cd43dd5943ab 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -298,6 +298,9 @@ Attribute Changes in Clang not change the behaviour of the compiler, as this was true for previous versions. +- Fix a bug where clang doesn't automatically apply the ``[[gsl::Owner]]`` or + ``[[gsl::Pointer]]`` to STL explicit template specialization decls. (#GH109442) + Improvements to Clang's diagnostics ----------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 20f038a0a9bbde..6fff562165080a 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -599,14 +599,6 @@ TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) -// Bitcast - -BUILTIN(__nvvm_bitcast_f2i, "if", "") -BUILTIN(__nvvm_bitcast_i2f, "fi", "") - -BUILTIN(__nvvm_bitcast_ll2d, "dLLi", "") -BUILTIN(__nvvm_bitcast_d2ll, "LLid", "") - // FNS TARGET_BUILTIN(__nvvm_fns, "UiUiUii", "n", PTX60) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8bd5abf2bf9643..fd8aa8de79b49f 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -12587,8 +12587,7 @@ void ASTContext::forEachMultiversionedFunctionVersion( FD->getDeclContext()->getRedeclContext()->lookup(FD->getDeclName())) { FunctionDecl *CurFD = CurDecl->getAsFunction()->getMostRecentDecl(); if (CurFD && hasSameType(CurFD->getType(), FD->getType()) && - !SeenDecls.contains(CurFD)) { - SeenDecls.insert(CurFD); + SeenDecls.insert(CurFD).second) { Pred(CurFD); } } diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 68c3cdff712fb2..e89863a231bed4 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3097,12 +3097,11 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { QualType ElementType = E->getAllocatedType(); std::optional ElemT = classify(ElementType); unsigned PlacementArgs = E->getNumPlacementArgs(); + const FunctionDecl *OperatorNew = E->getOperatorNew(); + const Expr *PlacementDest = nullptr; bool IsNoThrow = false; - // FIXME: Better diagnostic. 
diag::note_constexpr_new_placement if (PlacementArgs != 0) { - // The only new-placement list we support is of the form (std::nothrow). - // // FIXME: There is no restriction on this, but it's not clear that any // other form makes any sense. We get here for cases such as: // @@ -3111,27 +3110,43 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { // (which should presumably be valid only if N is a multiple of // alignof(int), and in any case can't be deallocated unless N is // alignof(X) and X has new-extended alignment). - if (PlacementArgs != 1 || !E->getPlacementArg(0)->getType()->isNothrowT()) - return this->emitInvalid(E); + if (PlacementArgs == 1) { + const Expr *Arg1 = E->getPlacementArg(0); + if (Arg1->getType()->isNothrowT()) { + if (!this->discard(Arg1)) + return false; + IsNoThrow = true; + } else if (Ctx.getLangOpts().CPlusPlus26 && + OperatorNew->isReservedGlobalPlacementOperator()) { + // If we have a placement-new destination, we'll later use that instead + // of allocating. + PlacementDest = Arg1; + } else { + return this->emitInvalidNewDeleteExpr(E, E); + } - if (!this->discard(E->getPlacementArg(0))) - return false; - IsNoThrow = true; + } else { + return this->emitInvalid(E); + } + } else if (!OperatorNew->isReplaceableGlobalAllocationFunction()) { + return this->emitInvalidNewDeleteExpr(E, E); } const Descriptor *Desc; - if (ElemT) { - if (E->isArray()) - Desc = nullptr; // We're not going to use it in this case. - else - Desc = P.createDescriptor(E, *ElemT, Descriptor::InlineDescMD, - /*IsConst=*/false, /*IsTemporary=*/false, - /*IsMutable=*/false); - } else { - Desc = P.createDescriptor( - E, ElementType.getTypePtr(), - E->isArray() ? std::nullopt : Descriptor::InlineDescMD, - /*IsConst=*/false, /*IsTemporary=*/false, /*IsMutable=*/false, Init); + if (!PlacementDest) { + if (ElemT) { + if (E->isArray()) + Desc = nullptr; // We're not going to use it in this case. + else + Desc = P.createDescriptor(E, *ElemT, Descriptor::InlineDescMD, + /*IsConst=*/false, /*IsTemporary=*/false, + /*IsMutable=*/false); + } else { + Desc = P.createDescriptor( + E, ElementType.getTypePtr(), + E->isArray() ? std::nullopt : Descriptor::InlineDescMD, + /*IsConst=*/false, /*IsTemporary=*/false, /*IsMutable=*/false, Init); + } } if (E->isArray()) { @@ -3148,26 +3163,42 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { PrimType SizeT = classifyPrim(Stripped->getType()); - if (!this->visit(Stripped)) - return false; - - if (ElemT) { - // N primitive elements. - if (!this->emitAllocN(SizeT, *ElemT, E, IsNoThrow, E)) + if (PlacementDest) { + if (!this->visit(PlacementDest)) + return false; + if (!this->visit(Stripped)) + return false; + if (!this->emitCheckNewTypeMismatchArray(SizeT, E, E)) return false; } else { - // N Composite elements. - if (!this->emitAllocCN(SizeT, Desc, IsNoThrow, E)) + if (!this->visit(Stripped)) return false; + + if (ElemT) { + // N primitive elements. + if (!this->emitAllocN(SizeT, *ElemT, E, IsNoThrow, E)) + return false; + } else { + // N Composite elements. + if (!this->emitAllocCN(SizeT, Desc, IsNoThrow, E)) + return false; + } } if (Init && !this->visitInitializer(Init)) return false; } else { - // Allocate just one element. - if (!this->emitAlloc(Desc, E)) - return false; + if (PlacementDest) { + if (!this->visit(PlacementDest)) + return false; + if (!this->emitCheckNewTypeMismatch(E, E)) + return false; + } else { + // Allocate just one element. 
+ if (!this->emitAlloc(Desc, E)) + return false; + } if (Init) { if (ElemT) { @@ -3194,6 +3225,11 @@ template bool Compiler::VisitCXXDeleteExpr(const CXXDeleteExpr *E) { const Expr *Arg = E->getArgument(); + const FunctionDecl *OperatorDelete = E->getOperatorDelete(); + + if (!OperatorDelete->isReplaceableGlobalAllocationFunction()) + return this->emitInvalidNewDeleteExpr(E, E); + // Arg must be an lvalue. if (!this->visit(Arg)) return false; diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 0587ffd67eba51..739f6d2d8a7e95 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1286,6 +1286,80 @@ bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, return Call(S, OpPC, F, VarArgSize); } +bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E, + std::optional ArraySize) { + const Pointer &Ptr = S.Stk.peek(); + + if (!CheckStore(S, OpPC, Ptr)) + return false; + + const auto *NewExpr = cast(E); + QualType StorageType = Ptr.getType(); + + if (isa_and_nonnull(Ptr.getFieldDesc()->asExpr())) { + // FIXME: Are there other cases where this is a problem? + StorageType = StorageType->getPointeeType(); + } + + const ASTContext &ASTCtx = S.getASTContext(); + QualType AllocType; + if (ArraySize) { + AllocType = ASTCtx.getConstantArrayType( + NewExpr->getAllocatedType(), + APInt(64, static_cast(*ArraySize), false), nullptr, + ArraySizeModifier::Normal, 0); + } else { + AllocType = NewExpr->getAllocatedType(); + } + + unsigned StorageSize = 1; + unsigned AllocSize = 1; + if (const auto *CAT = dyn_cast(AllocType)) + AllocSize = CAT->getZExtSize(); + if (const auto *CAT = dyn_cast(StorageType)) + StorageSize = CAT->getZExtSize(); + + if (AllocSize > StorageSize || + !ASTCtx.hasSimilarType(ASTCtx.getBaseElementType(AllocType), + ASTCtx.getBaseElementType(StorageType))) { + S.FFDiag(S.Current->getLocation(OpPC), + diag::note_constexpr_placement_new_wrong_type) + << StorageType << AllocType; + return false; + } + return true; +} + +bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E) { + assert(E); + const auto &Loc = S.Current->getSource(OpPC); + + if (const auto *NewExpr = dyn_cast(E)) { + const FunctionDecl *OperatorNew = NewExpr->getOperatorNew(); + + if (!S.getLangOpts().CPlusPlus26 && NewExpr->getNumPlacementArgs() > 0) { + S.FFDiag(Loc, diag::note_constexpr_new_placement) + << /*C++26 feature*/ 1 << E->getSourceRange(); + } else if (NewExpr->getNumPlacementArgs() == 1 && + !OperatorNew->isReservedGlobalPlacementOperator()) { + S.FFDiag(Loc, diag::note_constexpr_new_placement) + << /*Unsupported*/ 0 << E->getSourceRange(); + } else if (!OperatorNew->isReplaceableGlobalAllocationFunction()) { + S.FFDiag(Loc, diag::note_constexpr_new_non_replaceable) + << isa(OperatorNew) << OperatorNew; + } + } else { + const auto *DeleteExpr = cast(E); + const FunctionDecl *OperatorDelete = DeleteExpr->getOperatorDelete(); + if (!OperatorDelete->isReplaceableGlobalAllocationFunction()) { + S.FFDiag(Loc, diag::note_constexpr_new_non_replaceable) + << isa(OperatorDelete) << OperatorDelete; + } + } + + return false; +} + bool Interpret(InterpState &S, APValue &Result) { // The current stack frame when we started Interpret(). 
// This is being used by the ops to determine wheter diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 4aceb83eee0e71..1f4c302b26197f 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2947,6 +2947,17 @@ static inline bool IsConstantContext(InterpState &S, CodePtr OpPC) { return true; } +/// Check if the initializer and storage types of a placement-new expression +/// match. +bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E, + std::optional ArraySize = std::nullopt); + +template ::T> +bool CheckNewTypeMismatchArray(InterpState &S, CodePtr OpPC, const Expr *E) { + const auto &Size = S.Stk.pop(); + return CheckNewTypeMismatch(S, OpPC, E, static_cast(Size)); +} +bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E); //===----------------------------------------------------------------------===// // Read opcode arguments //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index e3a88c069847b8..36191f096aeb81 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -787,4 +787,18 @@ def Free : Opcode { let Args = [ArgBool]; } +def CheckNewTypeMismatch : Opcode { + let Args = [ArgExpr]; +} + +def InvalidNewDeleteExpr : Opcode { + let Args = [ArgExpr]; +} + +def CheckNewTypeMismatchArray : Opcode { + let Types = [IntegerTypeClass]; + let Args = [ArgExpr]; + let HasGroup = 1; +} + def IsConstantContext: Opcode; diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index a4925e84784af9..b6ea4440507ea1 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -486,3 +486,15 @@ bool RISCVTargetInfo::validateCpuSupports(StringRef Feature) const { bool RISCVTargetInfo::isValidFeatureName(StringRef Name) const { return llvm::RISCVISAInfo::isSupportedExtensionFeature(Name); } + +bool RISCVTargetInfo::validateGlobalRegisterVariable( + StringRef RegName, unsigned RegSize, bool &HasSizeMismatch) const { + if (RegName == "ra" || RegName == "sp" || RegName == "gp" || + RegName == "tp" || RegName.starts_with("x") || RegName.starts_with("a") || + RegName.starts_with("s") || RegName.starts_with("t")) { + unsigned XLen = getTriple().isArch64Bit() ? 
64 : 32; + HasSizeMismatch = RegSize != XLen; + return true; + } + return false; +} diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h index b808ccc8e9cfe9..351ef21e197c4d 100644 --- a/clang/lib/Basic/Targets/RISCV.h +++ b/clang/lib/Basic/Targets/RISCV.h @@ -131,6 +131,9 @@ class RISCVTargetInfo : public TargetInfo { bool supportsCpuInit() const override { return getTriple().isOSLinux(); } bool validateCpuSupports(StringRef Feature) const override; bool isValidFeatureName(StringRef Name) const override; + + bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize, + bool &HasSizeMismatch) const override; }; class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo { public: diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d4177b0aa56cc4..04c14a576900e2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13653,7 +13653,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, else InitValStr = std::to_string(InitVal.getZExtValue()); std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr; - Value *EnumStrVal = Builder.CreateGlobalStringPtr(EnumStr); + Value *EnumStrVal = Builder.CreateGlobalString(EnumStr); ConstantInt *Flag = cast(EmitScalarExpr(E->getArg(1))); Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue()); @@ -18179,7 +18179,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, CallOps.push_back(Ops[i]); llvm::Function *F = CGM.getIntrinsic(ID); Value *Call = Builder.CreateCall(F, CallOps); - return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64)); + return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign()); } case PPC::BI__builtin_ppc_compare_and_swap: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 82caf65ac68d6b..b7f5b932c56b6f 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1807,7 +1807,7 @@ ScalarExprEmitter::VisitSYCLUniqueStableNameExpr(SYCLUniqueStableNameExpr *E) { ASTContext &Context = CGF.getContext(); unsigned AddrSpace = Context.getTargetAddressSpace(CGF.CGM.GetGlobalConstantAddressSpace()); - llvm::Constant *GlobalConstStr = Builder.CreateGlobalStringPtr( + llvm::Constant *GlobalConstStr = Builder.CreateGlobalString( E->ComputeName(Context), "__usn_str", AddrSpace); llvm::Type *ExprTy = ConvertType(E->getType()); diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 7ddaad61539a18..24547f836faf9f 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -1455,6 +1455,7 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) { // C99 6.8.4.1: The first substatement is executed if the expression compares // unequal to 0. The condition must be a scalar type. 
LexicalScope ConditionScope(*this, S.getCond()->getSourceRange()); + ApplyDebugLocation DL(*this, S.getCond()); if (S.getInit()) EmitStmt(S.getInit()); diff --git a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp index 180a0125023ee7..ac5baf72a65c90 100644 --- a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp +++ b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp @@ -4358,21 +4358,21 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp, for (unsigned i = 0; i < InnerBlockDeclRefs.size(); i++) { DeclRefExpr *Exp = InnerBlockDeclRefs[i]; ValueDecl *VD = Exp->getDecl(); - if (!VD->hasAttr() && !BlockByCopyDeclsPtrSet.count(VD)) { + if (!VD->hasAttr() && + BlockByCopyDeclsPtrSet.insert(VD).second) { // We need to save the copied-in variables in nested // blocks because it is needed at the end for some of the API // generations. See SynthesizeBlockLiterals routine. InnerDeclRefs.push_back(Exp); countOfInnerDecls++; BlockDeclRefs.push_back(Exp); - BlockByCopyDeclsPtrSet.insert(VD); BlockByCopyDecls.push_back(VD); } - if (VD->hasAttr() && !BlockByRefDeclsPtrSet.count(VD)) { + if (VD->hasAttr() && + BlockByRefDeclsPtrSet.insert(VD).second) { InnerDeclRefs.push_back(Exp); countOfInnerDecls++; BlockDeclRefs.push_back(Exp); - BlockByRefDeclsPtrSet.insert(VD); BlockByRefDecls.push_back(VD); } } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index a6c3613cb8b528..d450ac9aeda7fc 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -8632,6 +8632,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization( return SkipBody->Previous; Specialization->setInvalidDecl(Invalid); + inferGslOwnerPointerAttribute(Specialization); return Specialization; } diff --git a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp index d8c52941b19366..a76639bb86b208 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp @@ -337,6 +337,10 @@ static std::optional printReferrer(const MemRegion *Referrer) { // warn_bind_ref_member_to_parameter or // warn_init_ptr_member_to_parameter_addr return std::nullopt; + } else if (isa(Referrer)) { + // Skip alloca() regions, they indicate advanced memory management + // and higher likelihood of CSA false positives. + return std::nullopt; } else { assert(false && "Unexpected referrer region type."); return std::nullopt; diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 6cefbba307215a..d62f12b63eee8e 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -241,12 +241,10 @@ namespace std { -/// FIXME: The new interpreter produces the wrong diagnostic. namespace PlacementNew { constexpr int foo() { // both-error {{never produces a constant expression}} char c[sizeof(int)]; - new (c) int{12}; // ref-note {{this placement new expression is not supported in constant expressions before C++2c}} \ - // expected-note {{subexpression not valid in a constant expression}} + new (c) int{12}; // both-note {{this placement new expression is not supported in constant expressions before C++2c}} return 0; } } @@ -305,31 +303,28 @@ namespace placement_new_delete { } static_assert(ok()); - /// FIXME: Diagnosting placement new. 
constexpr bool bad(int which) { switch (which) { case 0: - delete new (placement_new_arg{}) int; // ref-note {{this placement new expression is not supported in constant expressions}} \ - // expected-note {{subexpression not valid in a constant expression}} + delete new (placement_new_arg{}) int; // both-note {{this placement new expression is not supported in constant expressions}} break; case 1: - delete new ClassSpecificNew; // ref-note {{call to class-specific 'operator new'}} + delete new ClassSpecificNew; // both-note {{call to class-specific 'operator new'}} break; case 2: - delete new ClassSpecificDelete; // ref-note {{call to class-specific 'operator delete'}} + delete new ClassSpecificDelete; // both-note {{call to class-specific 'operator delete'}} break; case 3: - delete new DestroyingDelete; // ref-note {{call to class-specific 'operator delete'}} + delete new DestroyingDelete; // both-note {{call to class-specific 'operator delete'}} break; case 4: // FIXME: This technically follows the standard's rules, but it seems // unreasonable to expect implementations to support this. - delete new (std::align_val_t{64}) Overaligned; // ref-note {{this placement new expression is not supported in constant expressions}} \ - // expected-note {{subexpression not valid in a constant expression}} + delete new (std::align_val_t{64}) Overaligned; // both-note {{this placement new expression is not supported in constant expressions}} break; } @@ -337,9 +332,9 @@ namespace placement_new_delete { } static_assert(bad(0)); // both-error {{constant expression}} \ // both-note {{in call}} - static_assert(bad(1)); // ref-error {{constant expression}} ref-note {{in call}} - static_assert(bad(2)); // ref-error {{constant expression}} ref-note {{in call}} - static_assert(bad(3)); // ref-error {{constant expression}} ref-note {{in call}} + static_assert(bad(1)); // both-error {{constant expression}} both-note {{in call}} + static_assert(bad(2)); // both-error {{constant expression}} both-note {{in call}} + static_assert(bad(3)); // both-error {{constant expression}} both-note {{in call}} static_assert(bad(4)); // both-error {{constant expression}} \ // both-note {{in call}} } @@ -586,7 +581,6 @@ constexpr void use_after_free_2() { // both-error {{never produces a constant ex p->f(); // both-note {{member call on heap allocated object that has been deleted}} } - /// std::allocator definition namespace std { using size_t = decltype(sizeof(0)); @@ -758,6 +752,18 @@ namespace Limits { #endif } +/// Just test that we reject placement-new expressions before C++2c. 
+/// Tests for successful expressions are in placement-new.cpp +namespace Placement { + consteval auto ok1() { // both-error {{never produces a constant expression}} + bool b; + new (&b) bool(true); // both-note 2{{this placement new expression is not supported in constant expressions before C++2c}} + return b; + } + static_assert(ok1()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} + #else /// Make sure we reject this prior to C++20 constexpr int a() { // both-error {{never produces a constant expression}} diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp new file mode 100644 index 00000000000000..9e86217c5fbf36 --- /dev/null +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -0,0 +1,219 @@ +// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -verify=ref,both %s + +namespace std { + using size_t = decltype(sizeof(0)); +} + +void *operator new(std::size_t, void *p) { return p; } +void* operator new[] (std::size_t, void* p) {return p;} + + +consteval auto ok1() { + bool b; + new (&b) bool(true); + return b; +} +static_assert(ok1()); + +consteval auto ok2() { + int b; + new (&b) int(12); + return b; +} +static_assert(ok2() == 12); + + +consteval auto ok3() { + float b; + new (&b) float(12.0); + return b; +} +static_assert(ok3() == 12.0); + + +consteval auto ok4() { + _BitInt(11) b; + new (&b) _BitInt(11)(37); + return b; +} +static_assert(ok4() == 37); + +/// FIXME: Broken in both interpreters. +#if 0 +consteval int ok5() { + int i; + new (&i) int[1]{1}; // expected-note {{assignment to dereferenced one-past-the-end pointer}} + return i; +} +static_assert(ok5() == 1); // expected-error {{not an integral constant expression}} \ + // expected-note {{in call to}} +#endif + +/// FIXME: Crashes the current interpreter. 
+#if 0 +consteval int ok6() { + int i[2]; + new (&i) int(100); + return i[0]; +} +static_assert(ok6() == 100); +#endif + +consteval int ok6() { + int i[2]; + new (i) int(100); + new (i + 1) int(200); + return i[0] + i[1]; +} +static_assert(ok6() == 300); + + +consteval auto fail1() { + int b; + new (&b) float(1.0); // both-note {{placement new would change type of storage from 'int' to 'float'}} + return b; +} +static_assert(fail1() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +consteval int fail2() { + int i; + new (static_cast(&i)) float(0); // both-note {{placement new would change type of storage from 'int' to 'float'}} + return 0; +} +static_assert(fail2() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +consteval int indeterminate() { + int * indeterminate; + new (indeterminate) int(0); // both-note {{read of uninitialized object is not allowed in a constant expression}} + return 0; +} +static_assert(indeterminate() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +consteval int array1() { + int i[2]; + new (&i) int[]{1,2}; + return i[0] + i[1]; +} +static_assert(array1() == 3); + +consteval int array2() { + int i[2]; + new (static_cast(&i)) int[]{1,2}; + return i[0] + i[1]; +} +static_assert(array2() == 3); + +consteval int array3() { + int i[1]; + new (&i) int[2]; // both-note {{placement new would change type of storage from 'int[1]' to 'int[2]'}} + return 0; +} +static_assert(array3() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +consteval int array4() { + int i[2]; + new (&i) int[]{12}; + return i[0]; +} +static_assert(array4() == 12); + +constexpr int *intptr() { + return new int; +} +constexpr bool yay() { + int *ptr = new (intptr()) int(42); + bool ret = *ptr == 42; + delete ptr; + return ret; +} +static_assert(yay()); + + +constexpr bool blah() { + int *ptr = new (intptr()) int[3]{ 1, 2, 3 }; // both-note {{placement new would change type of storage from 'int' to 'int[3]'}} + bool ret = ptr[0] == 1 && ptr[1] == 2 && ptr[2] == 3; + delete [] ptr; + return ret; +} +static_assert(blah()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'blah()'}} + + +constexpr int *get_indeterminate() { + int *evil; + return evil; // both-note {{read of uninitialized object is not allowed in a constant expression}} +} + +constexpr bool bleh() { + int *ptr = new (get_indeterminate()) int; // both-note {{in call to 'get_indeterminate()'}} + return true; +} +static_assert(bleh()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'bleh()'}} + +namespace records { + class S { + public: + float f; + }; + + constexpr bool record1() { + S s(13); + new (&s) S(42); + return s.f == 42; + } + static_assert(record1()); + + S GlobalS; + constexpr bool record2() { + new (&GlobalS) S(42); // both-note {{a constant expression cannot modify an object that is visible outside that expression}} + return GlobalS.f == 42; + } + static_assert(record2()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + + + constexpr bool record3() { + S ss[3]; + + new (&ss) S[]{{1}, {2}, {3}}; + + return ss[0].f == 1 && ss[1].f == 2 && ss[2].f == 3; + } + static_assert(record3()); + + struct F { + float f; + }; + struct R { + F f; + int a; + }; + constexpr bool record4() { + R r; + new (&r.f) F{42.0}; + new (&r.a) int(12); + + return 
r.f.f == 42.0 && r.a == 12; + } + static_assert(record4()); + + /// Destructor is NOT called. + struct A { + bool b; + constexpr ~A() { if (b) throw; } + }; + + constexpr int foo() { + A a; + new (&a) A(true); + new (&a) A(false); + return 0; + } + static_assert(foo() == 0); +} diff --git a/clang/test/Analysis/stack-addr-ps.cpp b/clang/test/Analysis/stack-addr-ps.cpp index 35f38fbbfbefdc..73e9dbeca460f6 100644 --- a/clang/test/Analysis/stack-addr-ps.cpp +++ b/clang/test/Analysis/stack-addr-ps.cpp @@ -1,10 +1,23 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s -Wno-undefined-bool-conversion +// RUN: %clang_analyze_cc1 \ +// RUN: -analyzer-checker=core,debug.ExprInspection \ +// RUN: -verify %s \ +// RUN: -Wno-undefined-bool-conversion +// RUN: %clang_analyze_cc1 \ +// RUN: -analyzer-checker=core,debug.ExprInspection,unix.Malloc \ +// RUN: -verify %s \ +// RUN: -Wno-undefined-bool-conversion +// unix.Malloc is necessary to model __builtin_alloca, +// which could trigger an "unexpected region" bug in StackAddrEscapeChecker. typedef __INTPTR_TYPE__ intptr_t; template void clang_analyzer_dump(T x); +using size_t = decltype(sizeof(int)); +void * malloc(size_t size); +void free(void*); + const int& g() { int s; return s; // expected-warning{{Address of stack memory associated with local variable 's' returned}} expected-warning{{reference to stack memory associated with local variable 's' returned}} @@ -846,3 +859,21 @@ void top(char **p) { foo(); // no-warning FIXME: p binding is reclaimed before the function end } } // namespace early_reclaim_dead_limitation + +namespace alloca_region_pointer { +void callee(char **pptr) { + char local; + *pptr = &local; +} // no crash + +void top_alloca_no_crash_fn() { + char **pptr = (char**)__builtin_alloca(sizeof(char*)); + callee(pptr); +} + +void top_malloc_no_crash_fn() { + char **pptr = (char**)malloc(sizeof(char*)); + callee(pptr); + free(pptr); +} +} // namespace alloca_region_pointer diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c index 8a2bc93dd6cd0a..cdbfdd6b7975ad 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c @@ -99,7 +99,7 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vec // CHECK-LE-NOOPT-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC1_ADDR]], align 16 // CHECK-LE-NOOPT-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC2_ADDR]], align 16 // CHECK-LE-NOOPT-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP5]], <16 x i8> [[TMP4]]) -// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP6]], ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP6]], ptr [[RES]], align 32 // CHECK-LE-NOOPT-NEXT: [[TMP7:%.*]] = load <256 x i1>, ptr [[RES]], align 32 // CHECK-LE-NOOPT-NEXT: [[TMP8:%.*]] = load ptr, ptr [[RESP_ADDR]], align 8 // CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP7]], ptr [[TMP8]], align 32 diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c index 39c040967dc0c3..b18bb3ad050aca 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c @@ -85,11 +85,11 @@ void testVQLocal(int *ptr, vector unsigned char vc) { // CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr 
[[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP5:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) -// CHECK-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 64 +// CHECK-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 32 // CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP8:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP7]], <16 x i8> [[TMP6]]) -// CHECK-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 64 +// CHECK-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 32 // CHECK-NEXT: [[TMP9:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 // CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP11:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP9]], <16 x i8> [[TMP10]]) @@ -118,11 +118,11 @@ void testVQLocal(int *ptr, vector unsigned char vc) { // CHECK-BE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP5:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 32 // CHECK-BE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP8:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 32 // CHECK-BE-NEXT: [[TMP9:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 // CHECK-BE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP11:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP9]], <16 x i8> [[TMP10]]) diff --git a/clang/test/CodeGenCXX/debug-info-line-if-2.cpp b/clang/test/CodeGenCXX/debug-info-line-if-2.cpp new file mode 100644 index 00000000000000..8ab96a7daf4c47 --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-line-if-2.cpp @@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -debug-info-kind=limited -gno-column-info -triple=x86_64-pc-linux -emit-llvm %s -o - | FileCheck %s + +// The important thing is that the compare and the conditional branch have +// locs with the same scope (the lexical block for the 'if'). By turning off +// column info, they end up with the same !dbg record, which halves the number +// of checks to verify the scope. 
+ +int c = 2; + +int f() { +#line 100 + if (int a = 5; a > c) + return 1; + return 0; +} +// CHECK-LABEL: define {{.*}} @_Z1fv() +// CHECK: = icmp {{.*}} !dbg [[F_CMP:![0-9]+]] +// CHECK-NEXT: br i1 {{.*}} !dbg [[F_CMP]] + +int g() { +#line 200 + if (int a = f()) + return 2; + return 3; +} +// CHECK-LABEL: define {{.*}} @_Z1gv() +// CHECK: = icmp {{.*}} !dbg [[G_CMP:![0-9]+]] +// CHECK-NEXT: br i1 {{.*}} !dbg [[G_CMP]] + +int h() { +#line 300 + if (c > 3) + return 4; + return 5; +} +// CHECK-LABEL: define {{.*}} @_Z1hv() +// CHECK: = icmp {{.*}} !dbg [[H_CMP:![0-9]+]] +// CHECK-NEXT: br i1 {{.*}} !dbg [[H_CMP]] + +// CHECK-DAG: [[F_CMP]] = !DILocation(line: 100, scope: [[F_SCOPE:![0-9]+]] +// CHECK-DAG: [[F_SCOPE]] = distinct !DILexicalBlock({{.*}} line: 100) +// CHECK-DAG: [[G_CMP]] = !DILocation(line: 200, scope: [[G_SCOPE:![0-9]+]] +// CHECK-DAG: [[G_SCOPE]] = distinct !DILexicalBlock({{.*}} line: 200) +// CHECK-DAG: [[H_CMP]] = !DILocation(line: 300, scope: [[H_SCOPE:![0-9]+]] +// CHECK-DAG: [[H_SCOPE]] = distinct !DILexicalBlock({{.*}} line: 300) diff --git a/clang/test/Sema/riscv-asm.c b/clang/test/Sema/riscv-asm.c index 82664c013175d4..69ba3be3345d5a 100644 --- a/clang/test/Sema/riscv-asm.c +++ b/clang/test/Sema/riscv-asm.c @@ -1,8 +1,6 @@ // RUN: %clang_cc1 %s -triple riscv32 -verify -fsyntax-only // RUN: %clang_cc1 %s -triple riscv64 -verify -fsyntax-only -// expected-no-diagnostics - void i (void) { asm volatile ("" ::: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7"); asm volatile ("" ::: "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); @@ -26,3 +24,18 @@ void f (void) { asm volatile ("" ::: "fa6", "fa7", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7"); asm volatile ("" ::: "fs8", "fs9", "fs10", "fs11", "ft8", "ft9", "ft10", "ft11"); } + +register char i1 __asm__ ("x1"); // expected-error {{size of register 'x1' does not match variable size}} +#if __riscv_xlen == 32 +register long long ll2 __asm__ ("x2"); // expected-error {{size of register 'x2' does not match variable size}} +register int i2 __asm__ ("x3"); +#endif +register long l3 __asm__ ("x4"); +register long ra __asm__ ("ra"); +register long sp __asm__ ("sp"); +register int *gp __asm__ ("gp"); +register char *tp __asm__ ("tp"); +register long a7 __asm__ ("a7"); +register long s11 __asm__ ("s11"); +register long t5 __asm__ ("t5"); +register long* f1 __asm__ ("f1"); // expected-error {{register 'f1' unsuitable for global register variables on this target}} diff --git a/clang/test/SemaCXX/attr-gsl-owner-pointer-std.cpp b/clang/test/SemaCXX/attr-gsl-owner-pointer-std.cpp index 352e1e473580a6..8fb4cc7621fedf 100644 --- a/clang/test/SemaCXX/attr-gsl-owner-pointer-std.cpp +++ b/clang/test/SemaCXX/attr-gsl-owner-pointer-std.cpp @@ -27,6 +27,11 @@ class vector { static_assert(sizeof(vector), ""); // Force instantiation. static_assert(sizeof(vector::iterator), ""); // Force instantiation. +template <> +class vector {}; +// CHECK: ClassTemplateSpecializationDecl {{.*}} vector +// CHECK: OwnerAttr {{.*}} + // If std::container::iterator is a using declaration, attributes are inferred // for the underlying class. 
template @@ -173,6 +178,18 @@ class reference_wrapper; class some_unknown_type; // CHECK: CXXRecordDecl {{.*}} some_unknown_type +using size_t = unsigned; +inline constexpr size_t dynamic_extent = -1; +template +class span; +// CHECK: CXXRecordDecl {{.*}} span +// CHECK: PointerAttr {{.*}} + + +template +struct span<_Tp, dynamic_extent> {}; +// CHECK: ClassTemplatePartialSpecializationDecl {{.*}} span +// CHECK: PointerAttr {{.*}} } // namespace std namespace user { diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp index d186d1aaa8d93e..9cc7214aef85c7 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp @@ -64,13 +64,15 @@ INTERCEPTOR(int, open, const char *path, int oflag, ...) { // O_NONBLOCK __rtsan_notify_intercepted_call("open"); - va_list args; - va_start(args, oflag); - const mode_t mode = va_arg(args, int); - va_end(args); + if (OpenReadsVaArgs(oflag)) { + va_list args; + va_start(args, oflag); + const mode_t mode = va_arg(args, int); + va_end(args); + return REAL(open)(path, oflag, mode); + } - const int result = REAL(open)(path, oflag, mode); - return result; + return REAL(open)(path, oflag); } #if SANITIZER_INTERCEPT_OPEN64 @@ -79,13 +81,15 @@ INTERCEPTOR(int, open64, const char *path, int oflag, ...) { // O_NONBLOCK __rtsan_notify_intercepted_call("open64"); - va_list args; - va_start(args, oflag); - const mode_t mode = va_arg(args, int); - va_end(args); + if (OpenReadsVaArgs(oflag)) { + va_list args; + va_start(args, oflag); + const mode_t mode = va_arg(args, int); + va_end(args); + return REAL(open64)(path, oflag, mode); + } - const int result = REAL(open64)(path, oflag, mode); - return result; + return REAL(open64)(path, oflag); } #define RTSAN_MAYBE_INTERCEPT_OPEN64 INTERCEPT_FUNCTION(open64) #else @@ -97,13 +101,15 @@ INTERCEPTOR(int, openat, int fd, const char *path, int oflag, ...) { // O_NONBLOCK __rtsan_notify_intercepted_call("openat"); - va_list args; - va_start(args, oflag); - mode_t mode = va_arg(args, int); - va_end(args); + if (OpenReadsVaArgs(oflag)) { + va_list args; + va_start(args, oflag); + const mode_t mode = va_arg(args, int); + va_end(args); + return REAL(openat)(fd, path, oflag, mode); + } - const int result = REAL(openat)(fd, path, oflag, mode); - return result; + return REAL(openat)(fd, path, oflag); } #if SANITIZER_INTERCEPT_OPENAT64 @@ -112,13 +118,15 @@ INTERCEPTOR(int, openat64, int fd, const char *path, int oflag, ...) 
{ // O_NONBLOCK __rtsan_notify_intercepted_call("openat64"); - va_list args; - va_start(args, oflag); - mode_t mode = va_arg(args, int); - va_end(args); + if (OpenReadsVaArgs(oflag)) { + va_list args; + va_start(args, oflag); + const mode_t mode = va_arg(args, int); + va_end(args); + return REAL(openat64)(fd, path, oflag, mode); + } - const int result = REAL(openat64)(fd, path, oflag, mode); - return result; + return REAL(openat64)(fd, path, oflag); } #define RTSAN_MAYBE_INTERCEPT_OPENAT64 INTERCEPT_FUNCTION(openat64) #else diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp index 7d7d5754319947..69af6465a62c2d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp @@ -353,7 +353,15 @@ bool ShouldMockFailureToOpen(const char *path) { internal_strncmp(path, "/proc/", 6) == 0; } -#if SANITIZER_LINUX && !SANITIZER_ANDROID && !SANITIZER_GO +bool OpenReadsVaArgs(int oflag) { +# ifdef O_TMPFILE + return (oflag & (O_CREAT | O_TMPFILE)) != 0; +# else + return (oflag & O_CREAT) != 0; +# endif +} + +# if SANITIZER_LINUX && !SANITIZER_ANDROID && !SANITIZER_GO int GetNamedMappingFd(const char *name, uptr size, int *flags) { if (!common_flags()->decorate_proc_maps || !name) return -1; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h index d0954f77e97136..1f0795caa420c7 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h @@ -108,6 +108,7 @@ bool IsStateDetached(int state); fd_t ReserveStandardFds(fd_t fd); bool ShouldMockFailureToOpen(const char *path); +bool OpenReadsVaArgs(int oflag); // Create a non-file mapping with a given /proc/self/maps name. uptr MmapNamed(void *addr, uptr length, int prot, int flags, const char *name); diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index 53c876f4f9175f..423d97e94d81ae 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -1680,13 +1680,23 @@ TSAN_INTERCEPTOR(int, fstat64, int fd, void *buf) { #endif TSAN_INTERCEPTOR(int, open, const char *name, int oflag, ...) 
{ - va_list ap; - va_start(ap, oflag); - mode_t mode = va_arg(ap, int); - va_end(ap); + mode_t mode = 0; + if (OpenReadsVaArgs(oflag)) { + va_list ap; + va_start(ap, oflag); + mode = va_arg(ap, int); + va_end(ap); + } + SCOPED_TSAN_INTERCEPTOR(open, name, oflag, mode); READ_STRING(thr, pc, name, 0); - int fd = REAL(open)(name, oflag, mode); + + int fd; + if (OpenReadsVaArgs(oflag)) + fd = REAL(open)(name, oflag, mode); + else + fd = REAL(open)(name, oflag); + if (fd >= 0) FdFileCreate(thr, pc, fd); return fd; diff --git a/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test b/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test index ba42433f98c296..0d750185204aed 100644 --- a/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test +++ b/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test @@ -1,8 +1,8 @@ RUN: rm -rf %t && split-file %s %t && cd %t -RUN: %clang_pgogen -fPIC foo.c -c -Xclang -fprofile-instrument-path="default_foo_%m.profraw" -RUN: %clang_pgogen -fPIC foo2.c -c -Xclang -fprofile-instrument-path="default_foo2_%m.profraw" -RUN: %clang_pgogen -shared foo.o -o shr_foo.o %if target={{.*aix.*}} %{ -bcdtors:mbr %} -RUN: %clang_pgogen -shared foo2.o -o shr_foo2.o +RUN: %clang_pgogen -fprofile-update=atomic -fPIC foo.c -c -Xclang -fprofile-instrument-path="default_foo_%m.profraw" +RUN: %clang_pgogen -fprofile-update=atomic -fPIC foo2.c -c -Xclang -fprofile-instrument-path="default_foo2_%m.profraw" +RUN: %clang_pgogen -fprofile-update=atomic -shared foo.o -o shr_foo.o %if target={{.*aix.*}} %{ -bcdtors:mbr %} +RUN: %clang_pgogen -fprofile-update=atomic -shared foo2.o -o shr_foo2.o RUN: %clang_pgogen common.c -c diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake index 63145fe709dda0..862c7ecbd7fdf4 100644 --- a/libc/cmake/modules/CheckCompilerFeatures.cmake +++ b/libc/cmake/modules/CheckCompilerFeatures.cmake @@ -10,6 +10,7 @@ set( "builtin_round" "builtin_roundeven" "float16" + "float16_conversion" "float128" "fixed_point" ) @@ -61,15 +62,21 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) set(link_options "") if(${feature} STREQUAL "fixed_point") list(APPEND compile_options "-ffixed-point") - elseif(${feature} MATCHES "^builtin_") + elseif(${feature} MATCHES "^builtin_" OR + ${feature} STREQUAL "float16_conversion") set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT}) set(link_options -nostdlib) - # The compiler might handle calls to rounding builtins by generating calls - # to the respective libc math functions, in which case we cannot use these + # The compiler might handle calls to math builtins by generating calls to + # the respective libc math functions, in which case we cannot use these # builtins in our implementations of these functions. We check that this is # not the case by trying to link an executable, since linking would fail due # to unresolved references with -nostdlib if calls to libc functions were # generated. + # + # We also had issues with soft-float float16 conversion functions using both + # compiler-rt and libgcc, so we also check whether we can convert from and + # to float16 without calls to compiler runtime functions by trying to link + # an executable with -nostdlib. 
set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE) endif() @@ -97,6 +104,8 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) list(APPEND AVAILABLE_COMPILER_FEATURES ${feature}) if(${feature} STREQUAL "float16") set(LIBC_TYPES_HAS_FLOAT16 TRUE) + elseif(${feature} STREQUAL "float16_conversion") + add_compile_definitions(__LIBC_USE_FLOAT16_CONVERSION) elseif(${feature} STREQUAL "float128") set(LIBC_TYPES_HAS_FLOAT128 TRUE) elseif(${feature} STREQUAL "fixed_point") @@ -115,6 +124,10 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) endif() endforeach() +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT}) +set(link_options "") + message(STATUS "Compiler features available: ${AVAILABLE_COMPILER_FEATURES}") ### Compiler Feature Detection ### diff --git a/libc/cmake/modules/compiler_features/check_float16_conversion.cpp b/libc/cmake/modules/compiler_features/check_float16_conversion.cpp new file mode 100644 index 00000000000000..09ac8e9c8bc9d6 --- /dev/null +++ b/libc/cmake/modules/compiler_features/check_float16_conversion.cpp @@ -0,0 +1,30 @@ +#include "include/llvm-libc-macros/float16-macros.h" +#include "include/llvm-libc-types/float128.h" + +#ifndef LIBC_TYPES_HAS_FLOAT16 +#error unsupported +#endif + +_Float16 cvt_from_float(float x) { return static_cast<_Float16>(x); } + +_Float16 cvt_from_double(double x) { return static_cast<_Float16>(x); } + +_Float16 cvt_from_long_double(long double x) { + return static_cast<_Float16>(x); +} + +#ifdef LIBC_TYPES_HAS_FLOAT128 +_Float16 cvt_from_float128(float128 x) { return static_cast<_Float16>(x); } +#endif + +float cvt_to_float(_Float16 x) { return x; } + +double cvt_to_double(_Float16 x) { return x; } + +long double cvt_to_long_double(_Float16 x) { return x; } + +#ifdef LIBC_TYPES_HAS_FLOAT128 +float128 cvt_to_float128(_Float16 x) { return x; } +#endif + +extern "C" void _start() {} diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index ea1e0e8b39d101..522b4afefd48d6 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -92,11 +92,14 @@ add_header_library( HDRS except_value_utils.h DEPENDS + .cast .fp_bits .fenv_impl .rounding_mode libc.src.__support.CPP.optional libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features + libc.src.__support.macros.properties.types ) @@ -175,9 +178,13 @@ add_header_library( .fenv_impl .fp_bits .multiply_add + .rounding_mode + libc.hdr.errno_macros + libc.hdr.fenv_macros libc.src.__support.CPP.type_traits libc.src.__support.big_int libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types ) add_header_library( @@ -217,18 +224,32 @@ add_header_library( HDRS ManipulationFunctions.h DEPENDS + .cast + .dyadic_float .fenv_impl .fp_bits - .dyadic_float .nearest_integer_operations .normal_float libc.hdr.math_macros + libc.src.errno.errno + libc.src.__support.common libc.src.__support.CPP.bit libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits - libc.src.__support.common libc.src.__support.macros.optimization - libc.src.errno.errno +) + +add_header_library( + cast + HDRS + cast.h + DEPENDS + .dyadic_float + .fp_bits + libc.hdr.fenv_macros + libc.src.__support.CPP.algorithm + libc.src.__support.CPP.type_traits + libc.src.__support.macros.properties.types ) add_subdirectory(generic) diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index 
a14f355789999a..66bfe2aa377f99 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -12,6 +12,7 @@ #include "FPBits.h" #include "NearestIntegerOperations.h" #include "NormalFloat.h" +#include "cast.h" #include "dyadic_float.h" #include "rounding_mode.h" @@ -192,7 +193,8 @@ ldexp(T x, U exp) { // For all other values, NormalFloat to T conversion handles it the right way. DyadicFloat::STORAGE_LEN> normal(bits.get_val()); normal.exponent += static_cast(exp); - return static_cast(normal); + // TODO: Add tests for exceptions. + return normal.template as(); } template to_bits(to); if (to_bits.is_nan()) - return static_cast(to); + return cast(to); // NOTE: This would work only if `U` has a greater or equal precision than // `T`. Otherwise `from` could loose its precision and the following statement // could incorrectly evaluate to `true`. - if (static_cast(from) == to) - return static_cast(to); + if (cast(from) == to) + return cast(to); using StorageType = typename FPBits::StorageType; if (from != T(0)) { - if ((static_cast(from) < to) == (from > T(0))) { + if ((cast(from) < to) == (from > T(0))) { from_bits = FPBits(StorageType(from_bits.uintval() + 1)); } else { from_bits = FPBits(StorageType(from_bits.uintval() - 1)); diff --git a/libc/src/__support/FPUtil/cast.h b/libc/src/__support/FPUtil/cast.h new file mode 100644 index 00000000000000..126f3852137b77 --- /dev/null +++ b/libc/src/__support/FPUtil/cast.h @@ -0,0 +1,65 @@ +//===-- Conversion between floating-point types -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_CAST_H +#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_CAST_H + +#include "FPBits.h" +#include "dyadic_float.h" +#include "hdr/fenv_macros.h" +#include "src/__support/CPP/algorithm.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE::fputil { + +template +LIBC_INLINE constexpr cpp::enable_if_t && + cpp::is_floating_point_v, + OutType> +cast(InType x) { +#if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION) + if constexpr (cpp::is_same_v || + cpp::is_same_v) { + using InFPBits = FPBits; + using InStorageType = typename InFPBits::StorageType; + using OutFPBits = FPBits; + using OutStorageType = typename OutFPBits::StorageType; + + InFPBits x_bits(x); + + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + raise_except_if_required(FE_INVALID); + return OutFPBits::quiet_nan().get_val(); + } + + InStorageType x_mant = x_bits.get_mantissa(); + if (InFPBits::FRACTION_LEN > OutFPBits::FRACTION_LEN) + x_mant >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_mant)) + .get_val(); + } + + if (x_bits.is_inf()) + return OutFPBits::inf(x_bits.sign()).get_val(); + + constexpr size_t MAX_FRACTION_LEN = + cpp::max(OutFPBits::FRACTION_LEN, InFPBits::FRACTION_LEN); + DyadicFloat xd(x); + return xd.template as(); + } +#endif + + return static_cast(x); +} + +} // namespace LIBC_NAMESPACE::fputil + +#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_CAST_H diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 
86346a47b35a34..165ffc7c922025 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -11,11 +11,15 @@ #include "FEnvImpl.h" #include "FPBits.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" #include "multiply_add.h" +#include "rounding_mode.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/big_int.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/types.h" #include @@ -97,13 +101,120 @@ template struct DyadicFloat { return exponent + (Bits - 1); } - // Assume that it is already normalized. - // Output is rounded correctly with respect to the current rounding mode. +#ifdef LIBC_TYPES_HAS_FLOAT16 + template + LIBC_INLINE constexpr cpp::enable_if_t< + cpp::is_floating_point_v && (FPBits::FRACTION_LEN < Bits), T> + generic_as() const { + using FPBits = FPBits; + using StorageType = typename FPBits::StorageType; + + constexpr int EXTRA_FRACTION_LEN = Bits - 1 - FPBits::FRACTION_LEN; + + if (mantissa == 0) + return FPBits::zero(sign).get_val(); + + int unbiased_exp = get_unbiased_exponent(); + + if (unbiased_exp + FPBits::EXP_BIAS >= FPBits::MAX_BIASED_EXPONENT) { + if constexpr (ShouldSignalExceptions) { + set_errno_if_required(ERANGE); + raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + } + + switch (quick_get_round()) { + case FE_TONEAREST: + return FPBits::inf(sign).get_val(); + case FE_TOWARDZERO: + return FPBits::max_normal(sign).get_val(); + case FE_DOWNWARD: + if (sign.is_pos()) + return FPBits::max_normal(Sign::POS).get_val(); + return FPBits::inf(Sign::NEG).get_val(); + case FE_UPWARD: + if (sign.is_neg()) + return FPBits::max_normal(Sign::NEG).get_val(); + return FPBits::inf(Sign::POS).get_val(); + default: + __builtin_unreachable(); + } + } + + StorageType out_biased_exp = 0; + StorageType out_mantissa = 0; + bool round = false; + bool sticky = false; + bool underflow = false; + + if (unbiased_exp < -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) { + sticky = true; + underflow = true; + } else if (unbiased_exp == -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) { + round = true; + MantissaType sticky_mask = (MantissaType(1) << (Bits - 1)) - 1; + sticky = (mantissa & sticky_mask) != 0; + } else { + int extra_fraction_len = EXTRA_FRACTION_LEN; + + if (unbiased_exp < 1 - FPBits::EXP_BIAS) { + underflow = true; + extra_fraction_len += 1 - FPBits::EXP_BIAS - unbiased_exp; + } else { + out_biased_exp = + static_cast(unbiased_exp + FPBits::EXP_BIAS); + } + + MantissaType round_mask = MantissaType(1) << (extra_fraction_len - 1); + round = (mantissa & round_mask) != 0; + MantissaType sticky_mask = round_mask - 1; + sticky = (mantissa & sticky_mask) != 0; + + out_mantissa = static_cast(mantissa >> extra_fraction_len); + } + + bool lsb = (out_mantissa & 1) != 0; + + StorageType result = + FPBits::create_value(sign, out_biased_exp, out_mantissa).uintval(); + + switch (quick_get_round()) { + case FE_TONEAREST: + if (round && (lsb || sticky)) + ++result; + break; + case FE_DOWNWARD: + if (sign.is_neg() && (round || sticky)) + ++result; + break; + case FE_UPWARD: + if (sign.is_pos() && (round || sticky)) + ++result; + break; + default: + break; + } + + if (ShouldSignalExceptions && (round || sticky)) { + int excepts = FE_INEXACT; + if (FPBits(result).is_inf()) { + set_errno_if_required(ERANGE); + excepts |= FE_OVERFLOW; + } else if (underflow) { + set_errno_if_required(ERANGE); + excepts |= FE_UNDERFLOW; + } + 
raise_except_if_required(excepts); + } + + return FPBits(result).get_val(); + } +#endif // LIBC_TYPES_HAS_FLOAT16 + template && (FPBits::FRACTION_LEN < Bits), void>> - LIBC_INLINE constexpr T as() const { + LIBC_INLINE constexpr T fast_as() const { if (LIBC_UNLIKELY(mantissa.is_zero())) return FPBits::zero(sign).get_val(); @@ -224,6 +335,20 @@ template struct DyadicFloat { return r; } + // Assume that it is already normalized. + // Output is rounded correctly with respect to the current rounding mode. + template && + (FPBits::FRACTION_LEN < Bits), + void>> + LIBC_INLINE constexpr T as() const { +#if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION) + if constexpr (cpp::is_same_v) + return generic_as(); +#endif + return fast_as(); + } + template && (FPBits::FRACTION_LEN < Bits), diff --git a/libc/src/__support/FPUtil/except_value_utils.h b/libc/src/__support/FPUtil/except_value_utils.h index b9f54aa24e3a22..f8e4e92d3e1fb3 100644 --- a/libc/src/__support/FPUtil/except_value_utils.h +++ b/libc/src/__support/FPUtil/except_value_utils.h @@ -11,10 +11,13 @@ #include "FEnvImpl.h" #include "FPBits.h" +#include "cast.h" #include "rounding_mode.h" #include "src/__support/CPP/optional.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE_DECL { @@ -113,6 +116,21 @@ template LIBC_INLINE T round_result_slightly_up(T value_rn) { return tmp; } +#if defined(LIBC_TYPES_HAS_FLOAT16) && \ + !defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) +template <> LIBC_INLINE float16 round_result_slightly_down(float16 value_rn) { + volatile float tmp = value_rn; + tmp -= FPBits::min_normal().get_val(); + return cast(tmp); +} + +template <> LIBC_INLINE float16 round_result_slightly_up(float16 value_rn) { + volatile float tmp = value_rn; + tmp += FPBits::min_normal().get_val(); + return cast(tmp); +} +#endif + } // namespace fputil } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt index 43096aa529fc37..60434d6f6f11ab 100644 --- a/libc/src/__support/FPUtil/generic/CMakeLists.txt +++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt @@ -8,6 +8,7 @@ add_header_library( libc.src.__support.common libc.src.__support.CPP.bit libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -21,16 +22,17 @@ add_header_library( FMA.h DEPENDS libc.hdr.fenv_macros + libc.src.__support.big_int libc.src.__support.common libc.src.__support.CPP.bit libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode - libc.src.__support.big_int libc.src.__support.macros.optimization libc.src.__support.uint128 ) @@ -60,9 +62,10 @@ add_header_library( libc.src.__support.CPP.bit libc.src.__support.CPP.type_traits libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.rounding_mode 
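The generic_as() routine added to DyadicFloat above narrows a normalized value to float16 by splitting the discarded mantissa bits into a round bit and a sticky bit and then consulting the current rounding mode. A standalone sketch of just the FE_TONEAREST (round-to-nearest-even) decision on plain integers is shown below; the function name and types are illustrative, not libc code:

#include <cstdint>

// Illustrative sketch: drop `shift` low bits of `mant` (shift >= 1) with
// round-to-nearest-even, mirroring the FE_TONEAREST case of generic_as().
static uint64_t round_to_nearest_even(uint64_t mant, int shift) {
  uint64_t result = mant >> shift;                              // truncated
  uint64_t round = (mant >> (shift - 1)) & 1;                   // first dropped bit
  bool sticky = (mant & ((uint64_t(1) << (shift - 1)) - 1)) != 0; // any other dropped bit
  uint64_t lsb = result & 1;                                    // ties go to even
  if (round && (lsb || sticky))
    ++result;
  return result;
}

// Example: mant = 0b10110110, shift = 3 truncates to 0b10110 with round = 1
// and sticky = 1, so the result is bumped to 0b10111 (182 / 8 = 22.75 -> 23).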
libc.src.__support.macros.attributes libc.src.__support.macros.optimization diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index e5683c8ff61ea0..bec312e44b1b10 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -14,6 +14,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/big_int.h" @@ -157,7 +158,7 @@ fma(InType x, InType y, InType z) { } if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0)) - return static_cast(x * y + z); + return cast(x * y + z); int x_exp = 0; int y_exp = 0; @@ -198,7 +199,7 @@ fma(InType x, InType y, InType z) { if (LIBC_UNLIKELY(x_exp == InFPBits::MAX_BIASED_EXPONENT || y_exp == InFPBits::MAX_BIASED_EXPONENT || z_exp == InFPBits::MAX_BIASED_EXPONENT)) - return static_cast(x * y + z); + return cast(x * y + z); // Extract mantissa and append hidden leading bits. InStorageType x_mant = x_bits.get_explicit_mantissa(); diff --git a/libc/src/__support/FPUtil/generic/add_sub.h b/libc/src/__support/FPUtil/generic/add_sub.h index 850db3f83209e6..6bc9dcd23bafad 100644 --- a/libc/src/__support/FPUtil/generic/add_sub.h +++ b/libc/src/__support/FPUtil/generic/add_sub.h @@ -17,6 +17,7 @@ #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/attributes.h" @@ -106,14 +107,14 @@ add_or_sub(InType x, InType y) { volatile InType tmp = y; if constexpr (IsSub) tmp = -tmp; - return static_cast(tmp); + return cast(tmp); } if (y_bits.is_zero()) { volatile InType tmp = y; if constexpr (IsSub) tmp = -tmp; - return static_cast(tmp); + return cast(tmp); } } diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index 4502cc07d32b31..01af4bb7c90092 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -14,6 +14,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" @@ -96,7 +97,7 @@ sqrt(InType x) { // sqrt(-0) = -0 // sqrt(NaN) = NaN // sqrt(-NaN) = -NaN - return static_cast(x); + return cast(x); } else if (bits.is_neg()) { // sqrt(-Inf) = NaN // sqrt(-x) = NaN diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 5a1ee3b8b83c77..d0676d03420c68 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -109,9 +109,10 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.nearest_integer_operations libc.src.__support.macros.properties.cpu_features + libc.src.__support.macros.properties.types FLAGS ROUND_OPT ) @@ -672,9 +673,10 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.nearest_integer_operations 
libc.src.__support.macros.properties.cpu_features + libc.src.__support.macros.properties.types FLAGS ROUND_OPT ) @@ -741,9 +743,10 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.nearest_integer_operations libc.src.__support.macros.properties.cpu_features + libc.src.__support.macros.properties.types FLAGS ROUND_OPT ) @@ -810,9 +813,10 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.nearest_integer_operations libc.src.__support.macros.properties.cpu_features + libc.src.__support.macros.properties.types FLAGS ROUND_OPT ) @@ -881,6 +885,7 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.FPUtil.cast libc.src.__support.macros.properties.cpu_features FLAGS ROUND_OPT @@ -1072,9 +1077,10 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.nearest_integer_operations libc.src.__support.macros.properties.cpu_features + libc.src.__support.macros.properties.types FLAGS ROUND_OPT ) @@ -1362,12 +1368,15 @@ add_entrypoint_object( .expxf16 libc.hdr.errno_macros libc.hdr.fenv_macros + libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.attributes libc.src.__support.macros.optimization COMPILE_OPTIONS -O3 @@ -1442,6 +1451,7 @@ add_entrypoint_object( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -1545,6 +1555,7 @@ add_entrypoint_object( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -1617,6 +1628,7 @@ add_entrypoint_object( .expxf16 libc.hdr.errno_macros libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp index 8af31c6623a02a..9d89efc5311d18 100644 --- a/libc/src/math/generic/ceilf16.cpp +++ b/libc/src/math/generic/ceilf16.cpp @@ -8,6 +8,7 @@ #include "src/math/ceilf16.h" #include "src/__support/FPUtil/NearestIntegerOperations.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) { #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) - return static_cast(__builtin_ceilf(x)); + return fputil::cast(__builtin_ceilf(x)); #else return fputil::ceil(x); #endif diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp index 
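The float16 entrypoints touched here (ceilf16 above, and floorf16, rintf16, roundf16, roundevenf16, truncf16, the expf16 family below) all follow the same shape: do the computation via a builtin or in float, then narrow the result with fputil::cast instead of static_cast so the final conversion stays well-defined when the compiler lacks float16 conversion support. A compressed sketch of that shape, with an illustrative function name and a simplified guard:

#include "src/__support/FPUtil/NearestIntegerOperations.h"
#include "src/__support/FPUtil/cast.h"
#include "src/__support/macros/properties/cpu_features.h"
#include "src/__support/macros/properties/types.h"

#ifdef LIBC_TYPES_HAS_FLOAT16
// Illustrative only: the shape shared by the f16 entrypoints in this patch.
static float16 demo_roundf16(float16 x) {
#if defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
  // Fast path: let the builtin do the rounding in float, then narrow the
  // result through fputil::cast rather than static_cast.
  return LIBC_NAMESPACE::fputil::cast<float16>(__builtin_roundf(x));
#else
  // Portable path: the generic FPUtil implementation works on the float16
  // bit pattern directly and needs no float16 <-> float conversion at all.
  return LIBC_NAMESPACE::fputil::round(x);
#endif
}
#endif // LIBC_TYPES_HAS_FLOAT16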
9959f7450b591f..1c5966c1f1c126 100644 --- a/libc/src/math/generic/exp10f16.cpp +++ b/libc/src/math/generic/exp10f16.cpp @@ -14,6 +14,7 @@ #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" @@ -118,13 +119,13 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) { switch (x_u) { case 0x3c00U: // x = 1.0f16 - return static_cast(10.0); + return fputil::cast(10.0); case 0x4000U: // x = 2.0f16 - return static_cast(100.0); + return fputil::cast(100.0); case 0x4200U: // x = 3.0f16 - return static_cast(1'000.0); + return fputil::cast(1'000.0); case 0x4400U: // x = 4.0f16 - return static_cast(10'000.0); + return fputil::cast(10'000.0); } } @@ -164,7 +165,7 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { // > 1 + x * P; float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, 0x1.04b434p+1f, 0x1.2bcf9ep+0f); - return static_cast(exp2_hi_mid * exp10_lo); + return fputil::cast(exp2_hi_mid * exp10_lo); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp2f16.cpp b/libc/src/math/generic/exp2f16.cpp index 66b79567040053..3c4310259b1df9 100644 --- a/libc/src/math/generic/exp2f16.cpp +++ b/libc/src/math/generic/exp2f16.cpp @@ -14,6 +14,7 @@ #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" @@ -121,7 +122,7 @@ LLVM_LIBC_FUNCTION(float16, exp2f16, (float16 x)) { // > 1 + x * P; float exp2_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.62e43p-1f, 0x1.ec0aa6p-3f, 0x1.c6b4a6p-5f); - return static_cast(exp2_hi_mid * exp2_lo); + return fputil::cast(exp2_hi_mid * exp2_lo); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expf16.cpp b/libc/src/math/generic/expf16.cpp index 7ffdbd5191008a..0548ef3932ae92 100644 --- a/libc/src/math/generic/expf16.cpp +++ b/libc/src/math/generic/expf16.cpp @@ -13,6 +13,7 @@ #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" @@ -103,7 +104,7 @@ LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) { // > display = hexadecimal; // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]); // > 1 + x * P; - return static_cast( + return fputil::cast( fputil::polyeval(xf, 0x1p+0f, 0x1p+0f, 0x1.0004p-1f, 0x1.555778p-3f)); } } @@ -113,7 +114,7 @@ LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) { // exp(x) = exp(hi + mid) * exp(lo) auto [exp_hi_mid, exp_lo] = exp_range_reduction(x); - return static_cast(exp_hi_mid * exp_lo); + return fputil::cast(exp_hi_mid * exp_lo); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expm1f16.cpp b/libc/src/math/generic/expm1f16.cpp index 0facdc510e4287..4ce0efd1f461bb 100644 --- a/libc/src/math/generic/expm1f16.cpp +++ b/libc/src/math/generic/expm1f16.cpp @@ -13,6 +13,7 @@ #include "src/__support/FPUtil/FEnvImpl.h" #include 
"src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" @@ -99,7 +100,7 @@ LLVM_LIBC_FUNCTION(float16, expm1f16, (float16 x)) { FPBits::one(Sign::NEG).get_val()); // When x <= -0x1.0ap+3, round(expm1(x), HP, RN) = -0x1.ffcp-1. return fputil::round_result_slightly_down( - static_cast(-0x1.ffcp-1)); + fputil::cast(-0x1.ffcp-1)); } // When 0 < |x| <= 2^(-3). @@ -114,7 +115,7 @@ LLVM_LIBC_FUNCTION(float16, expm1f16, (float16 x)) { // > display = hexadecimal; // > P = fpminimax(expm1(x)/x, 4, [|SG...|], [-2^-3, 2^-3]); // > x * P; - return static_cast( + return fputil::cast( xf * fputil::polyeval(xf, 0x1p+0f, 0x1.fffff8p-2f, 0x1.555556p-3f, 0x1.55905ep-5f, 0x1.1124c2p-7f)); } @@ -126,7 +127,7 @@ LLVM_LIBC_FUNCTION(float16, expm1f16, (float16 x)) { // exp(x) = exp(hi + mid) * exp(lo) auto [exp_hi_mid, exp_lo] = exp_range_reduction(x); // expm1(x) = exp(hi + mid) * exp(lo) - 1 - return static_cast(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0f)); + return fputil::cast(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0f)); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp index 3092048f5ab061..361b22729f642e 100644 --- a/libc/src/math/generic/floorf16.cpp +++ b/libc/src/math/generic/floorf16.cpp @@ -8,6 +8,7 @@ #include "src/math/floorf16.h" #include "src/__support/FPUtil/NearestIntegerOperations.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) { #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) - return static_cast(__builtin_floorf(x)); + return fputil::cast(__builtin_floorf(x)); #else return fputil::floor(x); #endif diff --git a/libc/src/math/generic/rintf16.cpp b/libc/src/math/generic/rintf16.cpp index 3a53dd28e3d109..aefdcbea770644 100644 --- a/libc/src/math/generic/rintf16.cpp +++ b/libc/src/math/generic/rintf16.cpp @@ -8,6 +8,7 @@ #include "src/math/rintf16.h" #include "src/__support/FPUtil/NearestIntegerOperations.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, rintf16, (float16 x)) { #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) - return static_cast(__builtin_rintf(x)); + return fputil::cast(__builtin_rintf(x)); #else return fputil::round_using_current_rounding_mode(x); #endif diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp index c3dbd779b97395..fdcd968bc9b874 100644 --- a/libc/src/math/generic/roundevenf16.cpp +++ b/libc/src/math/generic/roundevenf16.cpp @@ -8,6 +8,7 @@ #include "src/math/roundevenf16.h" #include "src/__support/FPUtil/NearestIntegerOperations.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) { 
#if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) && \ defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) - return static_cast(__builtin_roundevenf(x)); + return fputil::cast(__builtin_roundevenf(x)); #else return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST); #endif diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp index a5e2b44fbd54bd..9adfb52ed27c67 100644 --- a/libc/src/math/generic/roundf16.cpp +++ b/libc/src/math/generic/roundf16.cpp @@ -8,6 +8,7 @@ #include "src/math/roundf16.h" #include "src/__support/FPUtil/NearestIntegerOperations.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) { #if defined(__LIBC_USE_BUILTIN_ROUND) && \ defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) - return static_cast(__builtin_roundf(x)); + return fputil::cast(__builtin_roundf(x)); #else return fputil::round(x); #endif diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp index 31b1214a9a0e4b..4d37e6560a965b 100644 --- a/libc/src/math/generic/truncf16.cpp +++ b/libc/src/math/generic/truncf16.cpp @@ -8,6 +8,7 @@ #include "src/math/truncf16.h" #include "src/__support/FPUtil/NearestIntegerOperations.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" @@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) { #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS) - return static_cast(__builtin_truncf(x)); + return fputil::cast(__builtin_truncf(x)); #else return fputil::trunc(x); #endif diff --git a/libc/test/src/math/smoke/AddTest.h b/libc/test/src/math/smoke/AddTest.h index 88c2067ca14748..f06a0868a520fc 100644 --- a/libc/test/src/math/smoke/AddTest.h +++ b/libc/test/src/math/smoke/AddTest.h @@ -35,22 +35,22 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { using AddFunc = OutType (*)(InType, InType); void test_special_numbers(AddFunc func) { - EXPECT_FP_IS_NAN(func(aNaN, aNaN)); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); + EXPECT_FP_IS_NAN(func(in.aNaN, in.aNaN)); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.sNaN, in.sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_IS_NAN(func(qnan_42, zero)); - EXPECT_FP_IS_NAN(func(zero, qnan_42)); + EXPECT_FP_IS_NAN(func(qnan_42, in.zero)); + EXPECT_FP_IS_NAN(func(in.zero, qnan_42)); - EXPECT_FP_EQ(inf, func(inf, zero)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); - EXPECT_FP_EQ(inf, func(inf, neg_zero)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, neg_zero)); + EXPECT_FP_EQ(inf, func(in.inf, in.zero)); + EXPECT_FP_EQ(neg_inf, func(in.neg_inf, in.zero)); + EXPECT_FP_EQ(inf, func(in.inf, in.neg_zero)); + EXPECT_FP_EQ(neg_inf, func(in.neg_inf, in.neg_zero)); } void test_invalid_operations(AddFunc func) { - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, neg_inf), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.inf, in.neg_inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_inf, in.inf), FE_INVALID); } void test_range_errors(AddFunc func) { @@ -58,10 +58,11 @@ class AddTest : public 
LIBC_NAMESPACE::testing::FEnvSafeTest { using namespace LIBC_NAMESPACE::fputil::testing; if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, neg_max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, + func(in.neg_max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); @@ -75,10 +76,11 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, neg_max_normal), + func(in.neg_max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal), @@ -91,9 +93,11 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::Downward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, neg_max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, + func(in.neg_max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); @@ -107,11 +111,11 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::Upward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, neg_max_normal), + func(in.neg_max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, @@ -127,7 +131,7 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } void test_inexact_results(AddFunc func) { - func(InType(1.0), min_denormal); + func(InType(1.0), in.min_denormal); EXPECT_FP_EXCEPTION(FE_INEXACT); } }; diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 47e16926f10df1..9f9203c491d044 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -401,6 +401,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.dfmal + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -413,6 +414,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.dfmaf128 + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -1062,6 +1064,7 @@ add_fp_unittest( libc.hdr.fenv_macros libc.src.errno.errno libc.src.math.expf16 + libc.src.__support.FPUtil.cast ) add_fp_unittest( @@ -1098,6 +1101,7 @@ add_fp_unittest( libc.hdr.fenv_macros libc.src.errno.errno libc.src.math.exp2f16 + libc.src.__support.FPUtil.cast ) add_fp_unittest( @@ -1145,6 +1149,7 @@ add_fp_unittest( libc.hdr.fenv_macros libc.src.errno.errno libc.src.math.exp10f16 + libc.src.__support.FPUtil.cast ) add_fp_unittest( @@ -3317,6 +3322,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.fmaf + libc.src.__support.macros.properties.types FLAGS FMA_OPT__ONLY ) @@ -3331,6 
+3337,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.fma + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -3368,6 +3375,7 @@ add_fp_unittest( libc.hdr.fenv_macros libc.src.errno.errno libc.src.math.expm1f16 + libc.src.__support.FPUtil.cast ) add_fp_unittest( @@ -4352,6 +4360,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.f16fma + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -4364,6 +4373,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.f16fmaf + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -4376,6 +4386,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.f16fmal + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -4388,6 +4399,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.f16fmaf128 + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -4490,6 +4502,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.ffma + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -4502,6 +4515,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.ffmal + libc.src.__support.macros.properties.types ) add_fp_unittest( @@ -4514,6 +4528,7 @@ add_fp_unittest( FmaTest.h DEPENDS libc.src.math.ffmaf128 + libc.src.__support.macros.properties.types ) add_fp_unittest( diff --git a/libc/test/src/math/smoke/DivTest.h b/libc/test/src/math/smoke/DivTest.h index 666179628c55ff..60e7a8adc9eba3 100644 --- a/libc/test/src/math/smoke/DivTest.h +++ b/libc/test/src/math/smoke/DivTest.h @@ -28,45 +28,47 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { using InFPBits = typename InConstants::FPBits; using InStorageType = typename InConstants::StorageType; + InConstants in; + public: using DivFunc = OutType (*)(InType, InType); void test_special_numbers(DivFunc func) { - EXPECT_FP_IS_NAN(func(aNaN, aNaN)); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); + EXPECT_FP_IS_NAN(func(in.aNaN, in.aNaN)); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.sNaN, in.sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_IS_NAN(func(qnan_42, zero)); - EXPECT_FP_IS_NAN(func(zero, qnan_42)); + EXPECT_FP_IS_NAN(func(qnan_42, in.zero)); + EXPECT_FP_IS_NAN(func(in.zero, qnan_42)); - EXPECT_FP_EQ(inf, func(inf, zero)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); - EXPECT_FP_EQ(neg_inf, func(inf, neg_zero)); - EXPECT_FP_EQ(inf, func(neg_inf, neg_zero)); + EXPECT_FP_EQ(inf, func(in.inf, in.zero)); + EXPECT_FP_EQ(neg_inf, func(in.neg_inf, in.zero)); + EXPECT_FP_EQ(neg_inf, func(in.inf, in.neg_zero)); + EXPECT_FP_EQ(inf, func(in.neg_inf, in.neg_zero)); } void test_division_by_zero(DivFunc func) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(InType(1.0), zero), FE_DIVBYZERO); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(InType(-1.0), zero), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(InType(1.0), in.zero), FE_DIVBYZERO); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(InType(-1.0), in.zero), FE_DIVBYZERO); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(InType(1.0), neg_zero), + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(InType(1.0), in.neg_zero), FE_DIVBYZERO); - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(InType(1.0), zero), FE_DIVBYZERO); + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(InType(1.0), in.zero), FE_DIVBYZERO); } void test_invalid_operations(DivFunc func) { - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(zero, zero), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_zero, zero), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(zero, neg_zero), FE_INVALID); - 
EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_zero, neg_zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.zero, in.zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_zero, in.zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.zero, in.neg_zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_zero, in.neg_zero), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.inf, in.inf), FE_INVALID); EXPECT_MATH_ERRNO(EDOM); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_inf, in.inf), FE_INVALID); EXPECT_MATH_ERRNO(EDOM); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, neg_inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.inf, in.neg_inf), FE_INVALID); EXPECT_MATH_ERRNO(EDOM); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, neg_inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_inf, in.neg_inf), FE_INVALID); EXPECT_MATH_ERRNO(EDOM); } @@ -74,64 +76,72 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { using namespace LIBC_NAMESPACE::fputil::testing; if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, min_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.min_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, min_denormal), + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, + func(in.neg_max_normal, in.min_denormal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, + func(in.neg_min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, min_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.min_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, min_denormal), + func(in.neg_max_normal, in.min_denormal), FE_OVERFLOW | FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, + func(in.neg_min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } if (ForceRoundingMode r(RoundingMode::Downward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, min_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.min_normal), FE_OVERFLOW | FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, min_denormal), + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, + func(in.neg_max_normal, in.min_denormal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION(neg_min_denormal, - 
func(neg_min_denormal, max_normal), + func(in.neg_min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } if (ForceRoundingMode r(RoundingMode::Upward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, min_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.min_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, min_denormal), + func(in.neg_max_normal, in.min_denormal), FE_OVERFLOW | FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, func(min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, + func(in.min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_min_denormal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, + func(in.neg_min_denormal, in.max_normal), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h index 0a4227da83f81d..ad9688fc01e7c1 100644 --- a/libc/test/src/math/smoke/FModTest.h +++ b/libc/test/src/math/smoke/FModTest.h @@ -108,61 +108,61 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(T(3.0), neg_inf, T(3.0), false, 0); TEST_SPECIAL(zero, aNaN, aNaN, false, 0); - TEST_SPECIAL(zero, -aNaN, aNaN, false, 0); + TEST_SPECIAL(zero, neg_aNaN, aNaN, false, 0); TEST_SPECIAL(neg_zero, aNaN, aNaN, false, 0); - TEST_SPECIAL(neg_zero, -aNaN, aNaN, false, 0); + TEST_SPECIAL(neg_zero, neg_aNaN, aNaN, false, 0); TEST_SPECIAL(T(1.0), aNaN, aNaN, false, 0); - TEST_SPECIAL(T(1.0), -aNaN, aNaN, false, 0); + TEST_SPECIAL(T(1.0), neg_aNaN, aNaN, false, 0); TEST_SPECIAL(inf, aNaN, aNaN, false, 0); - TEST_SPECIAL(inf, -aNaN, aNaN, false, 0); + TEST_SPECIAL(inf, neg_aNaN, aNaN, false, 0); TEST_SPECIAL(neg_inf, aNaN, aNaN, false, 0); - TEST_SPECIAL(neg_inf, -aNaN, aNaN, false, 0); + TEST_SPECIAL(neg_inf, neg_aNaN, aNaN, false, 0); TEST_SPECIAL(zero, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(zero, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(zero, neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(neg_zero, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(neg_zero, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_zero, neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(T(1.0), sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(T(1.0), -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(T(1.0), neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(inf, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(inf, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(inf, neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(neg_inf, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(neg_inf, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_inf, neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(aNaN, zero, aNaN, false, 0); - TEST_SPECIAL(-aNaN, zero, aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, zero, aNaN, false, 0); TEST_SPECIAL(aNaN, neg_zero, aNaN, false, 0); - TEST_SPECIAL(-aNaN, neg_zero, aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, neg_zero, aNaN, false, 0); TEST_SPECIAL(aNaN, T(1.0), aNaN, false, 0); - TEST_SPECIAL(-aNaN, T(1.0), aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, T(1.0), aNaN, false, 0); TEST_SPECIAL(aNaN, inf, aNaN, false, 0); - TEST_SPECIAL(-aNaN, inf, aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, inf, aNaN, false, 0); TEST_SPECIAL(aNaN, neg_inf, aNaN, false, 0); - TEST_SPECIAL(-aNaN, neg_inf, aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, neg_inf, aNaN, false, 0); 
TEST_SPECIAL(sNaN, zero, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, zero, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, zero, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, neg_zero, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, neg_zero, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, neg_zero, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, T(1.0), aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, T(1.0), aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, T(1.0), aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, inf, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, inf, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, inf, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, neg_inf, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, neg_inf, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, neg_inf, aNaN, false, FE_INVALID); TEST_SPECIAL(aNaN, aNaN, aNaN, false, 0); - TEST_SPECIAL(aNaN, -aNaN, aNaN, false, 0); - TEST_SPECIAL(-aNaN, aNaN, aNaN, false, 0); - TEST_SPECIAL(-aNaN, -aNaN, aNaN, false, 0); + TEST_SPECIAL(aNaN, neg_aNaN, aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, aNaN, aNaN, false, 0); + TEST_SPECIAL(neg_aNaN, neg_aNaN, aNaN, false, 0); TEST_SPECIAL(aNaN, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(aNaN, -sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(-aNaN, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(-aNaN, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(aNaN, neg_sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_aNaN, sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_aNaN, neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, aNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(sNaN, -aNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, aNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, -aNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(sNaN, neg_aNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, aNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, neg_aNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(sNaN, -sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(sNaN, neg_sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(neg_sNaN, neg_sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(T(6.5), T(2.25), T(2.0), false, 0); TEST_SPECIAL(T(-6.5), T(2.25), T(-2.0), false, 0); diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h index bf6d06d698fde5..41093422d51b2e 100644 --- a/libc/test/src/math/smoke/FmaTest.h +++ b/libc/test/src/math/smoke/FmaTest.h @@ -9,6 +9,9 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_FMATEST_H #define LLVM_LIBC_TEST_SRC_MATH_FMATEST_H +#include "src/__support/CPP/type_traits.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/macros/properties/types.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -37,6 +40,11 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { OutConstants out; InConstants in; + const InType in_out_min_normal = + LIBC_NAMESPACE::fputil::cast(out.min_normal); + const InType in_out_min_denormal = + LIBC_NAMESPACE::fputil::cast(out.min_denormal); + public: using FmaFunc = OutType (*)(InType, InType, InType); @@ -52,7 +60,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { // Test underflow rounding up. 
EXPECT_FP_EQ(OutFPBits(OutStorageType(2)).get_val(), - func(OutType(0.5), out.min_denormal, out.min_denormal)); + func(InType(0.5), in_out_min_denormal, in_out_min_denormal)); if constexpr (sizeof(OutType) < sizeof(InType)) { EXPECT_FP_EQ(out.zero, @@ -63,8 +71,9 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { OutType v = OutFPBits(static_cast(OUT_MIN_NORMAL_U + OutStorageType(1))) .get_val(); - EXPECT_FP_EQ(v, func(OutType(1) / OutType(OUT_MIN_NORMAL_U << 1), v, - out.min_normal)); + EXPECT_FP_EQ(v, func(InType(1) / InType(OUT_MIN_NORMAL_U << 1), + LIBC_NAMESPACE::fputil::cast(v), + in_out_min_normal)); if constexpr (sizeof(OutType) < sizeof(InType)) { InFPBits tmp = InFPBits::one(); @@ -74,12 +83,21 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { InType v = InFPBits(static_cast(IN_MIN_NORMAL_U + InStorageType(1))) .get_val(); - EXPECT_FP_EQ(out.min_normal, func(reciprocal_value, v, out.min_normal)); + EXPECT_FP_EQ(out.min_normal, + func(reciprocal_value, v, in_out_min_normal)); } // Test overflow. OutType z = out.max_normal; - EXPECT_FP_EQ_ALL_ROUNDING(OutType(0.75) * z, func(InType(1.75), z, -z)); + InType in_z = LIBC_NAMESPACE::fputil::cast(out.max_normal); +#if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION) + // Rounding modes other than the default might not be usable with float16. + if constexpr (LIBC_NAMESPACE::cpp::is_same_v) + EXPECT_FP_EQ(OutType(0.75) * z, func(InType(1.75), in_z, -in_z)); + else +#endif + EXPECT_FP_EQ_ALL_ROUNDING(OutType(0.75) * z, + func(InType(1.75), in_z, -in_z)); // Exact cancellation. EXPECT_FP_EQ_ROUNDING_NEAREST( diff --git a/libc/test/src/math/smoke/ModfTest.h b/libc/test/src/math/smoke/ModfTest.h index 6226e5d55f40cc..24cfb1152c2e5f 100644 --- a/libc/test/src/math/smoke/ModfTest.h +++ b/libc/test/src/math/smoke/ModfTest.h @@ -97,7 +97,7 @@ class ModfTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { T integral; T frac = func(x, &integral); - ASSERT_TRUE(LIBC_NAMESPACE::fputil::abs(frac) < 1.0l); + ASSERT_TRUE(LIBC_NAMESPACE::fputil::abs(frac) < T(1.0)); ASSERT_TRUE(LIBC_NAMESPACE::fputil::trunc(x) == integral); ASSERT_TRUE(integral + frac == x); } diff --git a/libc/test/src/math/smoke/MulTest.h b/libc/test/src/math/smoke/MulTest.h index 0c847e39687b72..c409122397b1d7 100644 --- a/libc/test/src/math/smoke/MulTest.h +++ b/libc/test/src/math/smoke/MulTest.h @@ -34,22 +34,22 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { using MulFunc = OutType (*)(InType, InType); void test_special_numbers(MulFunc func) { - EXPECT_FP_IS_NAN(func(aNaN, aNaN)); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); + EXPECT_FP_IS_NAN(func(in.aNaN, in.aNaN)); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.sNaN, in.sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_IS_NAN(func(qnan_42, zero)); - EXPECT_FP_IS_NAN(func(zero, qnan_42)); + EXPECT_FP_IS_NAN(func(qnan_42, in.zero)); + EXPECT_FP_IS_NAN(func(in.zero, qnan_42)); - EXPECT_FP_EQ(inf, func(inf, InType(1.0))); - EXPECT_FP_EQ(neg_inf, func(neg_inf, InType(1.0))); - EXPECT_FP_EQ(neg_inf, func(inf, InType(-1.0))); - EXPECT_FP_EQ(inf, func(neg_inf, InType(-1.0))); + EXPECT_FP_EQ(inf, func(in.inf, InType(1.0))); + EXPECT_FP_EQ(neg_inf, func(in.neg_inf, InType(1.0))); + EXPECT_FP_EQ(neg_inf, func(in.inf, InType(-1.0))); + EXPECT_FP_EQ(inf, func(in.neg_inf, InType(-1.0))); - EXPECT_FP_EQ_ALL_ROUNDING(zero, func(zero, zero)); - EXPECT_FP_EQ_ALL_ROUNDING(zero, func(neg_zero, 
neg_zero)); - EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(zero, neg_zero)); - EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(neg_zero, zero)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, func(in.zero, in.zero)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, func(in.neg_zero, in.neg_zero)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(in.zero, in.neg_zero)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(in.neg_zero, in.zero)); EXPECT_FP_EQ_ALL_ROUNDING(OutType(1.0), func(1.0, 1.0)); EXPECT_FP_EQ_ALL_ROUNDING(OutType(15.0), func(3.0, 5.0)); @@ -58,20 +58,21 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } void test_invalid_operations(MulFunc func) { - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, zero), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, neg_zero), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, zero), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, neg_zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.inf, in.zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.inf, in.neg_zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_inf, in.zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_inf, in.neg_zero), FE_INVALID); } void test_range_errors(MulFunc func) { using namespace LIBC_NAMESPACE::fputil::testing; if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, + func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); @@ -85,10 +86,11 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, max_normal), + func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal), @@ -101,9 +103,11 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::Downward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, + func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); @@ -117,11 +121,11 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::Upward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, max_normal), + func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h index 5992273d919012..61528f71305db0 100644 --- 
a/libc/test/src/math/smoke/NextTowardTest.h +++ b/libc/test/src/math/smoke/NextTowardTest.h @@ -43,6 +43,8 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { const T neg_zero = FPBits::zero(Sign::NEG).get_val(); const T nan = FPBits::quiet_nan().get_val(); + const long double to_inf = ToFPBits::inf(Sign::POS).get_val(); + const long double to_neg_inf = ToFPBits::inf(Sign::NEG).get_val(); const long double to_zero = ToFPBits::zero().get_val(); const long double to_neg_zero = ToFPBits::zero(Sign::NEG).get_val(); const long double to_nan = ToFPBits::quiet_nan().get_val(); @@ -134,7 +136,7 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ_WITH_UNDERFLOW(result, expected); - result = func(x, inf); + result = func(x, to_inf); expected_bits = min_normal + 1; expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ(result, expected); @@ -145,7 +147,7 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ_WITH_UNDERFLOW(result, expected); - result = func(x, -inf); + result = func(x, to_neg_inf); expected_bits = FPBits::SIGN_MASK + min_normal + 1; expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ(result, expected); @@ -156,14 +158,14 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { expected_bits = max_normal - 1; expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ(result, expected); - ASSERT_FP_EQ_WITH_OVERFLOW(func(x, inf), inf); + ASSERT_FP_EQ_WITH_OVERFLOW(func(x, to_inf), inf); x = -x; result = func(x, 0); expected_bits = FPBits::SIGN_MASK + max_normal - 1; expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ(result, expected); - ASSERT_FP_EQ_WITH_OVERFLOW(func(x, -inf), -inf); + ASSERT_FP_EQ_WITH_OVERFLOW(func(x, to_neg_inf), neg_inf); // 'from' is infinity. x = inf; @@ -171,14 +173,14 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { expected_bits = max_normal; expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ(result, expected); - ASSERT_FP_EQ(func(x, inf), inf); + ASSERT_FP_EQ(func(x, to_inf), inf); x = neg_inf; result = func(x, 0); expected_bits = FPBits::SIGN_MASK + max_normal; expected = LIBC_NAMESPACE::cpp::bit_cast(expected_bits); ASSERT_FP_EQ(result, expected); - ASSERT_FP_EQ(func(x, neg_inf), neg_inf); + ASSERT_FP_EQ(func(x, to_neg_inf), neg_inf); // 'from' is a power of 2. 
x = T(32.0); diff --git a/libc/test/src/math/smoke/SqrtTest.h b/libc/test/src/math/smoke/SqrtTest.h index ce9f2f85b4604a..b5eaee22fc79dd 100644 --- a/libc/test/src/math/smoke/SqrtTest.h +++ b/libc/test/src/math/smoke/SqrtTest.h @@ -15,15 +15,21 @@ class SqrtTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { DECLARE_SPECIAL_CONSTANTS(OutType) + struct InConstants { + DECLARE_SPECIAL_CONSTANTS(InType) + }; + + InConstants in; + public: typedef OutType (*SqrtFunc)(InType); void test_special_numbers(SqrtFunc func) { - ASSERT_FP_EQ(aNaN, func(aNaN)); - ASSERT_FP_EQ(inf, func(inf)); - ASSERT_FP_EQ(aNaN, func(neg_inf)); - ASSERT_FP_EQ(zero, func(zero)); - ASSERT_FP_EQ(neg_zero, func(neg_zero)); + ASSERT_FP_EQ(aNaN, func(in.aNaN)); + ASSERT_FP_EQ(inf, func(in.inf)); + ASSERT_FP_EQ(aNaN, func(in.neg_inf)); + ASSERT_FP_EQ(zero, func(in.zero)); + ASSERT_FP_EQ(neg_zero, func(in.neg_zero)); ASSERT_FP_EQ(aNaN, func(InType(-1.0))); ASSERT_FP_EQ(OutType(1.0), func(InType(1.0))); ASSERT_FP_EQ(OutType(2.0), func(InType(4.0))); diff --git a/libc/test/src/math/smoke/SubTest.h b/libc/test/src/math/smoke/SubTest.h index 99c4b6c760af72..8793b9f157f721 100644 --- a/libc/test/src/math/smoke/SubTest.h +++ b/libc/test/src/math/smoke/SubTest.h @@ -34,22 +34,22 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { using SubFunc = OutType (*)(InType, InType); void test_special_numbers(SubFunc func) { - EXPECT_FP_IS_NAN(func(aNaN, aNaN)); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); + EXPECT_FP_IS_NAN(func(in.aNaN, in.aNaN)); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.sNaN, in.sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_IS_NAN(func(qnan_42, zero)); - EXPECT_FP_IS_NAN(func(zero, qnan_42)); + EXPECT_FP_IS_NAN(func(qnan_42, in.zero)); + EXPECT_FP_IS_NAN(func(in.zero, qnan_42)); - EXPECT_FP_EQ(inf, func(inf, zero)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); - EXPECT_FP_EQ(inf, func(inf, neg_zero)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, neg_zero)); + EXPECT_FP_EQ(inf, func(in.inf, in.zero)); + EXPECT_FP_EQ(neg_inf, func(in.neg_inf, in.zero)); + EXPECT_FP_EQ(inf, func(in.inf, in.neg_zero)); + EXPECT_FP_EQ(neg_inf, func(in.neg_inf, in.neg_zero)); } void test_invalid_operations(SubFunc func) { - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, inf), FE_INVALID); - EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, neg_inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.inf, in.inf), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(in.neg_inf, in.neg_inf), FE_INVALID); } void test_range_errors(SubFunc func) { @@ -57,10 +57,10 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { using namespace LIBC_NAMESPACE::fputil::testing; if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, neg_max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); - EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); @@ -75,10 +75,11 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, neg_max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); 
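The narrowing-operation tests above (AddTest, DivTest, MulTest, SqrtTest, SubTest) now read input-side special values through an `in.` member: DECLARE_SPECIAL_CONSTANTS(OutType) at class scope defines aNaN, inf, zero and friends for the output type, so a second expansion for InType is wrapped in a nested InConstants struct to avoid redefining those names. A simplified, self-contained illustration of the pattern, without the real FPMatcher macros:

#include <limits>

// Simplified sketch (not the real test harness): two sets of special
// constants under the same names, the input-type set scoped inside `in`.
template <typename OutType, typename InType> struct NarrowingTestSketch {
  // What DECLARE_SPECIAL_CONSTANTS(OutType) would provide at class scope.
  OutType inf = std::numeric_limits<OutType>::infinity();
  OutType zero = OutType(0);

  // Input-type counterparts live in their own struct to avoid name clashes.
  struct InConstants {
    InType inf = std::numeric_limits<InType>::infinity();
    InType zero = InType(0);
  };
  InConstants in; // tests then spell them in.inf, in.zero, in.aNaN, ...
};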
EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, max_normal), + func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(zero, @@ -92,9 +93,10 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::Downward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, neg_max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, + func(in.max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(neg_max_normal, max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(-inf, func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); @@ -109,11 +111,11 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } if (ForceRoundingMode r(RoundingMode::Upward); r.success) { - EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, neg_max_normal), + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(in.max_normal, in.neg_max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, - func(neg_max_normal, max_normal), + func(in.neg_max_normal, in.max_normal), FE_OVERFLOW | FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, @@ -129,7 +131,7 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } void test_inexact_results(SubFunc func) { - func(InType(1.0), min_denormal); + func(InType(1.0), in.min_denormal); EXPECT_FP_EXCEPTION(FE_INEXACT); } }; diff --git a/libc/test/src/math/smoke/exp10f16_test.cpp b/libc/test/src/math/smoke/exp10f16_test.cpp index 006dfafa8aa141..1c4ef2aa08a70a 100644 --- a/libc/test/src/math/smoke/exp10f16_test.cpp +++ b/libc/test/src/math/smoke/exp10f16_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" #include "src/errno/libc_errno.h" #include "src/math/exp10f16.h" #include "test/UnitTest/FPMatcher.h" @@ -26,15 +27,14 @@ TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) { EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp10f16(inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast(zero), - LIBC_NAMESPACE::exp10f16(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp10f16(neg_inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast(1.0f), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast(1.0f), LIBC_NAMESPACE::exp10f16(zero)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast(1.0f), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast(1.0f), LIBC_NAMESPACE::exp10f16(neg_zero)); EXPECT_MATH_ERRNO(0); } @@ -47,7 +47,8 @@ TEST_F(LlvmLibcExp10f16Test, Overflow) { EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f16(static_cast(5.0)), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f16(LIBC_NAMESPACE::fputil::cast(5.0)), + FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } @@ -59,7 +60,8 @@ TEST_F(LlvmLibcExp10f16Test, Underflow) { EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - zero, LIBC_NAMESPACE::exp10f16(static_cast(-8.0)), + zero, + LIBC_NAMESPACE::exp10f16(LIBC_NAMESPACE::fputil::cast(-8.0)), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/exp2f16_test.cpp b/libc/test/src/math/smoke/exp2f16_test.cpp index cd87e6134557a5..f69b33a3cf37fe 100644 --- a/libc/test/src/math/smoke/exp2f16_test.cpp +++ b/libc/test/src/math/smoke/exp2f16_test.cpp @@ -7,6 +7,7 @@ 
//===----------------------------------------------------------------------===// #include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" #include "src/errno/libc_errno.h" #include "src/math/exp2f16.h" #include "test/UnitTest/FPMatcher.h" @@ -26,15 +27,14 @@ TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) { EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp2f16(inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(zero), - LIBC_NAMESPACE::exp2f16(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp2f16(neg_inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0f), LIBC_NAMESPACE::exp2f16(zero)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0f), LIBC_NAMESPACE::exp2f16(neg_zero)); EXPECT_MATH_ERRNO(0); } @@ -47,7 +47,8 @@ TEST_F(LlvmLibcExp2f16Test, Overflow) { EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f16(static_cast<float16>(16.0)), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f16(LIBC_NAMESPACE::fputil::cast<float16>(16.0)), + FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } @@ -59,7 +60,8 @@ TEST_F(LlvmLibcExp2f16Test, Underflow) { EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - zero, LIBC_NAMESPACE::exp2f16(static_cast<float16>(-25.0)), + zero, + LIBC_NAMESPACE::exp2f16(LIBC_NAMESPACE::fputil::cast<float16>(-25.0)), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp index 969870fe247bc2..ab745a3cf6f563 100644 --- a/libc/test/src/math/smoke/expf16_test.cpp +++ b/libc/test/src/math/smoke/expf16_test.cpp @@ -8,6 +8,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" #include "src/errno/libc_errno.h" #include "src/math/expf16.h" #include "test/UnitTest/FPMatcher.h" @@ -27,15 +28,14 @@ TEST_F(LlvmLibcExpf16Test, SpecialNumbers) { EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::expf16(inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(zero), - LIBC_NAMESPACE::expf16(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::expf16(neg_inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0f), LIBC_NAMESPACE::expf16(zero)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0f), LIBC_NAMESPACE::expf16(neg_zero)); EXPECT_MATH_ERRNO(0); } @@ -48,7 +48,8 @@ TEST_F(LlvmLibcExpf16Test, Overflow) { EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expf16(static_cast<float16>(12.0)), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf16(LIBC_NAMESPACE::fputil::cast<float16>(12.0)), + FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } @@ -60,7 +61,8 @@ TEST_F(LlvmLibcExpf16Test, Underflow) { EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - zero, LIBC_NAMESPACE::expf16(static_cast<float16>(-18.0)), + zero, + LIBC_NAMESPACE::expf16(LIBC_NAMESPACE::fputil::cast<float16>(-18.0)), FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/expm1f16_test.cpp b/libc/test/src/math/smoke/expm1f16_test.cpp index 3bdbaad2279416..f297c5dfc3c7e1 100644 --- a/libc/test/src/math/smoke/expm1f16_test.cpp +++ b/libc/test/src/math/smoke/expm1f16_test.cpp @@ -8,6 +8,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h"
#include "src/errno/libc_errno.h" #include "src/math/expm1f16.h" #include "test/UnitTest/FPMatcher.h" @@ -27,7 +28,7 @@ TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) { EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::expm1f16(inf)); EXPECT_MATH_ERRNO(0); - EXPECT_FP_EQ_ALL_ROUNDING(static_cast(-1.0), + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(neg_inf)); EXPECT_MATH_ERRNO(0); @@ -46,7 +47,7 @@ TEST_F(LlvmLibcExpm1f16Test, Overflow) { EXPECT_MATH_ERRNO(ERANGE); // round(16 * log(2), HP, RN); - float16 x = static_cast(0x1.63p+3); + float16 x = LIBC_NAMESPACE::fputil::cast(0x1.63p+3); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::expm1f16(x), FE_OVERFLOW | FE_INEXACT); @@ -68,41 +69,44 @@ TEST_F(LlvmLibcExpm1f16Test, Overflow) { TEST_F(LlvmLibcExpm1f16Test, ResultNearNegOne) { LIBC_NAMESPACE::libc_errno = 0; - EXPECT_FP_EQ_WITH_EXCEPTION(static_cast(-1.0), + EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(neg_max_normal), FE_INEXACT); // round(-11 * log(2), HP, RN); - float16 x = static_cast(-0x1.e8p+2); + float16 x = LIBC_NAMESPACE::fputil::cast(-0x1.e8p+2); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( - static_cast(-0x1.ffcp-1), LIBC_NAMESPACE::expm1f16(x), - FE_INEXACT); + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(static_cast(-0x1.ffcp-1), - LIBC_NAMESPACE::expm1f16(x), - FE_INEXACT); + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( - static_cast(-1.0), LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); + LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(x), + FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( - static_cast(-0x1.ffcp-1), LIBC_NAMESPACE::expm1f16(x), - FE_INEXACT); + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); - x = static_cast(-0x1.0a4p+3); + x = LIBC_NAMESPACE::fputil::cast(-0x1.0a4p+3); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( - static_cast(-1.0), LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); + LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(x), + FE_INEXACT); - EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(static_cast(-0x1.ffcp-1), - LIBC_NAMESPACE::expm1f16(x), - FE_INEXACT); + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( - static_cast(-1.0), LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); + LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(x), + FE_INEXACT); EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( - static_cast(-0x1.ffcp-1), LIBC_NAMESPACE::expm1f16(x), - FE_INEXACT); + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::expm1f16(x), FE_INEXACT); } diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt index 941d3cf004d483..0101c9f3990822 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -14,6 +14,7 @@ if(LIBC_TESTS_CAN_USE_MPFR) libc.src.__support.CPP.stringstream libc.src.__support.CPP.string_view libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fpbits_str LibcTest.unit diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp 
index 7ce6a70d093169..27ff1f7190ef95 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -13,6 +13,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/stringstream.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/fpbits_str.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/types.h" @@ -683,7 +684,7 @@ template <> long double MPFRNumber::as<long double>() const { template <> float16 MPFRNumber::as<float16>() const { // TODO: Either prove that this cast won't cause double-rounding errors, or // find a better way to get a float16. - return static_cast<float16>(mpfr_get_d(value, mpfr_rounding)); + return fputil::cast<float16>(mpfr_get_d(value, mpfr_rounding)); } #endif diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index da4cce6eac0c90..1c8bb057b09660 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -281,7 +281,7 @@ "`LWG3631 <https://wg21.link/LWG3631>`__","``basic_format_arg(T&&)`` should use ``remove_cvref_t`` throughout","2023-02 (Issaquah)","|Complete|","17.0","" "`LWG3645 <https://wg21.link/LWG3645>`__","``resize_and_overwrite`` is overspecified to call its callback with lvalues","2023-02 (Issaquah)","|Complete|","14.0","" "`LWG3655 <https://wg21.link/LWG3655>`__","The ``INVOKE`` operation and union types","2023-02 (Issaquah)","|Complete|","18.0","" -"`LWG3723 <https://wg21.link/LWG3723>`__","``priority_queue::push_range`` needs to ``append_range``","2023-02 (Issaquah)","","","" +"`LWG3723 <https://wg21.link/LWG3723>`__","``priority_queue::push_range`` needs to ``append_range``","2023-02 (Issaquah)","|Complete|","17.0","" "`LWG3734 <https://wg21.link/LWG3734>`__","Inconsistency in ``inout_ptr`` and ``out_ptr`` for empty case","2023-02 (Issaquah)","|Complete|","19.0","" "`LWG3772 <https://wg21.link/LWG3772>`__","``repeat_view``'s ``piecewise`` constructor is missing Postconditions","2023-02 (Issaquah)","|Complete|","17.0","" "`LWG3786 <https://wg21.link/LWG3786>`__","Flat maps' deduction guide needs to default ``Allocator`` to be useful","2023-02 (Issaquah)","","","" diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 3484e66d2b1d4d..1bbd2e1f21d7c3 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1215,7 +1215,7 @@ template <class ELFT> void ObjFile<ELFT>::importCmseSymbols() { continue; } - if (symtab.cmseImportLib.count(sym->getName())) { + if (ctx.symtab->cmseImportLib.count(sym->getName())) { error("CMSE symbol '" + sym->getName() + "' is multiply defined in import library '" + toString(this) + "'"); continue; @@ -1227,7 +1227,7 @@ template <class ELFT> void ObjFile<ELFT>::importCmseSymbols() { Twine(ACLESESYM_SIZE) + " bytes"); } - symtab.cmseImportLib[sym->getName()] = sym; + ctx.symtab->cmseImportLib[sym->getName()] = sym; } } @@ -1263,9 +1263,9 @@ static std::string checkCmseSymAttributes(Symbol *acleSeSym, Symbol *sym) { void elf::processArmCmseSymbols() { if (!ctx.arg.cmseImplib) return; - // Only symbols with external linkage end up in symtab, so no need to do + // Only symbols with external linkage end up in ctx.symtab, so no need to do // linkage checks. Only check symbol type. - for (Symbol *acleSeSym : symtab.getSymbols()) { + for (Symbol *acleSeSym : ctx.symtab->getSymbols()) { if (!acleSeSym->getName().starts_with(ACLESESYM_PREFIX)) continue; // If input object build attributes do not support CMSE, error and disable @@ -1279,7 +1279,7 @@ void elf::processArmCmseSymbols() { // Try to find the associated symbol definition. // Symbol must have external linkage.
StringRef name = acleSeSym->getName().substr(std::strlen(ACLESESYM_PREFIX)); - Symbol *sym = symtab.find(name); + Symbol *sym = ctx.symtab->find(name); if (!sym) { error(toString(acleSeSym->file) + ": cmse special symbol '" + acleSeSym->getName() + @@ -1295,7 +1295,7 @@ void elf::processArmCmseSymbols() { } // may be redefined later in the link in .gnu.sgstubs - symtab.cmseSymMap[name] = {acleSeSym, sym}; + ctx.symtab->cmseSymMap[name] = {acleSeSym, sym}; } // If this is an Arm CMSE secure app, replace references to entry symbol @@ -1304,8 +1304,8 @@ void elf::processArmCmseSymbols() { MutableArrayRef syms = file->getMutableSymbols(); for (size_t i = 0, e = syms.size(); i != e; ++i) { StringRef symName = syms[i]->getName(); - if (symtab.cmseSymMap.count(symName)) - syms[i] = symtab.cmseSymMap[symName].acleSeSym; + if (ctx.symtab->cmseSymMap.count(symName)) + syms[i] = ctx.symtab->cmseSymMap[symName].acleSeSym; } }); } @@ -1332,26 +1332,26 @@ ArmCmseSGSection::ArmCmseSGSection() /*alignment=*/32, ".gnu.sgstubs") { entsize = ACLESESYM_SIZE; // The range of addresses used in the CMSE import library should be fixed. - for (auto &[_, sym] : symtab.cmseImportLib) { + for (auto &[_, sym] : ctx.symtab->cmseImportLib) { if (impLibMaxAddr <= sym->value) impLibMaxAddr = sym->value + sym->size; } - if (symtab.cmseSymMap.empty()) + if (ctx.symtab->cmseSymMap.empty()) return; addMappingSymbol(); - for (auto &[_, entryFunc] : symtab.cmseSymMap) + for (auto &[_, entryFunc] : ctx.symtab->cmseSymMap) addSGVeneer(cast(entryFunc.acleSeSym), cast(entryFunc.sym)); - for (auto &[_, sym] : symtab.cmseImportLib) { - if (!symtab.inCMSEOutImpLib.count(sym->getName())) + for (auto &[_, sym] : ctx.symtab->cmseImportLib) { + if (!ctx.symtab->inCMSEOutImpLib.count(sym->getName())) warn("entry function '" + sym->getName() + "' from CMSE import library is not present in secure application"); } - if (!symtab.cmseImportLib.empty() && ctx.arg.cmseOutputLib.empty()) { - for (auto &[_, entryFunc] : symtab.cmseSymMap) { + if (!ctx.symtab->cmseImportLib.empty() && ctx.arg.cmseOutputLib.empty()) { + for (auto &[_, entryFunc] : ctx.symtab->cmseSymMap) { Symbol *sym = entryFunc.sym; - if (!symtab.inCMSEOutImpLib.count(sym->getName())) + if (!ctx.symtab->inCMSEOutImpLib.count(sym->getName())) warn("new entry function '" + sym->getName() + "' introduced but no output import library specified"); } @@ -1360,8 +1360,8 @@ ArmCmseSGSection::ArmCmseSGSection() void ArmCmseSGSection::addSGVeneer(Symbol *acleSeSym, Symbol *sym) { entries.emplace_back(acleSeSym, sym); - if (symtab.cmseImportLib.count(sym->getName())) - symtab.inCMSEOutImpLib[sym->getName()] = true; + if (ctx.symtab->cmseImportLib.count(sym->getName())) + ctx.symtab->inCMSEOutImpLib[sym->getName()] = true; // Symbol addresses different, nothing to do. if (acleSeSym->file != sym->file || cast(*acleSeSym).value != cast(*sym).value) @@ -1369,8 +1369,8 @@ void ArmCmseSGSection::addSGVeneer(Symbol *acleSeSym, Symbol *sym) { // Only secure symbols with values equal to that of it's non-secure // counterpart needs to be in the .gnu.sgstubs section. 
ArmCmseSGVeneer *ss = nullptr; - if (symtab.cmseImportLib.count(sym->getName())) { - Defined *impSym = symtab.cmseImportLib[sym->getName()]; + if (ctx.symtab->cmseImportLib.count(sym->getName())) { + Defined *impSym = ctx.symtab->cmseImportLib[sym->getName()]; ss = make(sym, acleSeSym, impSym->value); } else { ss = make(sym, acleSeSym); @@ -1451,12 +1451,12 @@ template void elf::writeARMCmseImportLib() { osIsPairs.emplace_back(make(impSymTab->name, 0, 0), impSymTab); osIsPairs.emplace_back(make(shstrtab->name, 0, 0), shstrtab); - std::sort(symtab.cmseSymMap.begin(), symtab.cmseSymMap.end(), + std::sort(ctx.symtab->cmseSymMap.begin(), ctx.symtab->cmseSymMap.end(), [](const auto &a, const auto &b) -> bool { return a.second.sym->getVA() < b.second.sym->getVA(); }); // Copy the secure gateway entry symbols to the import library symbol table. - for (auto &p : symtab.cmseSymMap) { + for (auto &p : ctx.symtab->cmseSymMap) { Defined *d = cast(p.second.sym); impSymTab->addSymbol(makeDefined( ctx.internalFile, d->getName(), d->computeBinding(), diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 803cc5402dda3c..fdf3d07b98bca1 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -251,7 +251,7 @@ void elf::writePrefixedInstruction(uint8_t *loc, uint64_t insn) { static bool addOptional(StringRef name, uint64_t value, std::vector &defined) { - Symbol *sym = symtab.find(name); + Symbol *sym = ctx.symtab->find(name); if (!sym || sym->isDefined()) return false; sym->resolve(Defined{ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 875463da056474..80a45bc4b63793 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -44,6 +44,7 @@ class InputSectionBase; class EhInputSection; class Defined; class Symbol; +class SymbolTable; class BitcodeCompiler; class OutputSection; class LinkerScript; @@ -600,6 +601,7 @@ struct Ctx { Defined *tlsModuleBase; }; ElfSym sym; + std::unique_ptr symtab; SmallVector> memoryBuffers; SmallVector objectFiles; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index f3ead3f50d8a03..206e358a07c2ee 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -109,6 +109,7 @@ void Ctx::reset() { in.reset(); sym = ElfSym{}; + symtab = std::make_unique(); memoryBuffers.clear(); objectFiles.clear(); @@ -155,7 +156,6 @@ bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, context->e.cleanupCallback = []() { elf::ctx.reset(); elf::ctx.partitions.emplace_back(); - symtab = SymbolTable(); SharedFile::vernauxNum = 0; }; @@ -167,6 +167,7 @@ bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, LinkerScript script(ctx); ctx.script = &script; ctx.symAux.emplace_back(); + ctx.symtab = std::make_unique(); ctx.partitions.clear(); ctx.partitions.emplace_back(); @@ -2207,7 +2208,7 @@ static void handleUndefinedGlob(Ctx &ctx, StringRef arg) { // Calling sym->extract() in the loop is not safe because it may add new // symbols to the symbol table, invalidating the current iterator. 
SmallVector syms; - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (!sym->isPlaceholder() && pat->match(sym->getName())) syms.push_back(sym); @@ -2216,7 +2217,7 @@ static void handleUndefinedGlob(Ctx &ctx, StringRef arg) { } static void handleLibcall(Ctx &ctx, StringRef name) { - Symbol *sym = symtab.find(name); + Symbol *sym = ctx.symtab->find(name); if (sym && sym->isLazy() && isa(sym->file)) { if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back("", sym->file, *sym); @@ -2403,7 +2404,7 @@ template static void findKeepUniqueSections(Ctx &ctx, opt::InputArgList &args) { for (auto *arg : args.filtered(OPT_keep_unique)) { StringRef name = arg->getValue(); - auto *d = dyn_cast_or_null(symtab.find(name)); + auto *d = dyn_cast_or_null(ctx.symtab->find(name)); if (!d || !d->section) { warn("could not find symbol " + name + " to keep unique"); continue; @@ -2418,7 +2419,7 @@ static void findKeepUniqueSections(Ctx &ctx, opt::InputArgList &args) { // Symbols in the dynsym could be address-significant in other executables // or DSOs, so we conservatively mark them as address-significant. - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (sym->includeInDynsym()) markAddrsig(sym); @@ -2587,24 +2588,24 @@ static std::vector addWrappedSymbols(opt::InputArgList &args) { if (!seen.insert(name).second) continue; - Symbol *sym = symtab.find(name); + Symbol *sym = ctx.symtab->find(name); if (!sym) continue; - Symbol *wrap = - symtab.addUnusedUndefined(saver().save("__wrap_" + name), sym->binding); + Symbol *wrap = ctx.symtab->addUnusedUndefined( + saver().save("__wrap_" + name), sym->binding); // If __real_ is referenced, pull in the symbol if it is lazy. Do this after // processing __wrap_ as that may have referenced __real_. StringRef realName = saver().save("__real_" + name); - if (Symbol *real = symtab.find(realName)) { - symtab.addUnusedUndefined(name, sym->binding); + if (Symbol *real = ctx.symtab->find(realName)) { + ctx.symtab->addUnusedUndefined(name, sym->binding); // Update sym's binding, which will replace real's later in // SymbolTable::wrap. sym->binding = real->binding; } - Symbol *real = symtab.addUnusedUndefined(realName); + Symbol *real = ctx.symtab->addUnusedUndefined(realName); v.push_back({sym, real, wrap}); // We want to tell LTO not to inline symbols to be overwritten @@ -2639,7 +2640,7 @@ static void combineVersionedSymbol(Symbol &sym, // // * There is a definition of foo@v1 and foo@@v1. // * There is a definition of foo@v1 and foo. - Defined *sym2 = dyn_cast_or_null(symtab.find(sym.getName())); + Defined *sym2 = dyn_cast_or_null(ctx.symtab->find(sym.getName())); if (!sym2) return; const char *suffix2 = sym2->getVersionSuffix(); @@ -2694,7 +2695,7 @@ static void redirectSymbols(Ctx &ctx, ArrayRef wrapped) { // symbols with a non-default version (foo@v1) and check whether it should be // combined with foo or foo@@v1. if (ctx.arg.versionDefinitions.size() > 2) - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (sym->hasVersionSuffix) combineVersionedSymbol(*sym, map); @@ -2710,7 +2711,7 @@ static void redirectSymbols(Ctx &ctx, ArrayRef wrapped) { // Update pointers in the symbol table. 
for (const WrappedSymbol &w : wrapped) - symtab.wrap(w.sym, w.real, w.wrap); + ctx.symtab->wrap(w.sym, w.real, w.wrap); } static void reportMissingFeature(StringRef config, const Twine &report) { @@ -2874,14 +2875,14 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Handle --trace-symbol. for (auto *arg : args.filtered(OPT_trace_symbol)) - symtab.insert(arg->getValue())->traced = true; + ctx.symtab->insert(arg->getValue())->traced = true; ctx.internalFile = createInternalFile(""); // Handle -u/--undefined before input files. If both a.a and b.so define foo, // -u foo a.a b.so will extract a.a. for (StringRef name : ctx.arg.undefined) - symtab.addUnusedUndefined(name)->referenced = true; + ctx.symtab->addUnusedUndefined(name)->referenced = true; parseFiles(files, armCmseImpLib); @@ -2889,7 +2890,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { ctx.arg.hasDynSymTab = !ctx.sharedFiles.empty() || ctx.arg.isPic; // If an entry symbol is in a static archive, pull out that file now. - if (Symbol *sym = symtab.find(ctx.arg.entry)) + if (Symbol *sym = ctx.symtab->find(ctx.arg.entry)) handleUndefined(ctx, sym, "--entry"); // Handle the `--undefined-glob ` options. @@ -2903,13 +2904,13 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Prevent LTO from removing any definition referenced by -u. for (StringRef name : ctx.arg.undefined) - if (Defined *sym = dyn_cast_or_null(symtab.find(name))) + if (Defined *sym = dyn_cast_or_null(ctx.symtab->find(name))) sym->isUsedInRegularObj = true; // Mark -init and -fini symbols so that the LTO doesn't eliminate them. - if (Symbol *sym = dyn_cast_or_null(symtab.find(ctx.arg.init))) + if (Symbol *sym = dyn_cast_or_null(ctx.symtab->find(ctx.arg.init))) sym->isUsedInRegularObj = true; - if (Symbol *sym = dyn_cast_or_null(symtab.find(ctx.arg.fini))) + if (Symbol *sym = dyn_cast_or_null(ctx.symtab->find(ctx.arg.fini))) sym->isUsedInRegularObj = true; // If any of our inputs are bitcode files, the LTO code generator may create @@ -2990,7 +2991,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // name "foo@ver1") rather do harm, so we don't call this if -r is given. if (!ctx.arg.relocatable) { llvm::TimeTraceScope timeScope("Process symbol versions"); - symtab.scanVersionScript(); + ctx.symtab->scanVersionScript(); } // Skip the normal linked output if some LTO options are specified. diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp index 9caff0bbe2b630..3f4f479785fd92 100644 --- a/lld/ELF/ICF.cpp +++ b/lld/ELF/ICF.cpp @@ -468,7 +468,7 @@ template void ICF::run() { // cannot be merged with the later computeIsPreemptible() pass which is used // by scanRelocations(). 
if (ctx.arg.hasDynSymTab) - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) sym->isPreemptible = computeIsPreemptible(*sym); // Two text sections may have identical content and relocations but different @@ -568,7 +568,7 @@ template void ICF::run() { d->folded = true; } }; - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) fold(sym); parallelForEach(ctx.objectFiles, [&](ELFFileBase *file) { for (Symbol *sym : file->getLocalSymbols()) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 05c09b84043e27..8d5234d9bb87bb 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -667,10 +667,10 @@ template void ObjFile::parse(bool ignoreComdats) { if (flag && flag != GRP_COMDAT) fatal(toString(this) + ": unsupported SHT_GROUP format"); - bool keepGroup = - (flag & GRP_COMDAT) == 0 || ignoreComdats || - symtab.comdatGroups.try_emplace(CachedHashStringRef(signature), this) - .second; + bool keepGroup = (flag & GRP_COMDAT) == 0 || ignoreComdats || + ctx.symtab->comdatGroups + .try_emplace(CachedHashStringRef(signature), this) + .second; if (keepGroup) { if (!ctx.arg.resolveGroups) this->sections[i] = createInputSection( @@ -817,8 +817,8 @@ void ObjFile::initializeSections(bool ignoreComdats, ArrayRef entries = cantFail(obj.template getSectionContentsAsArray(sec)); if ((entries[0] & GRP_COMDAT) == 0 || ignoreComdats || - symtab.comdatGroups.find(CachedHashStringRef(signature))->second == - this) + ctx.symtab->comdatGroups.find(CachedHashStringRef(signature)) + ->second == this) selectedGroups.push_back(entries); break; } @@ -1130,7 +1130,8 @@ void ObjFile::initializeSymbols(const object::ELFFile &obj) { // Some entries have been filled by LazyObjFile. for (size_t i = firstGlobal, end = eSyms.size(); i != end; ++i) if (!symbols[i]) - symbols[i] = symtab.insert(CHECK(eSyms[i].getName(stringTable), this)); + symbols[i] = + ctx.symtab->insert(CHECK(eSyms[i].getName(stringTable), this)); // Perform symbol resolution on non-local symbols. 
SmallVector undefineds; @@ -1508,7 +1509,7 @@ template void SharedFile::parse() { DenseMap::iterator it; bool wasInserted; std::tie(it, wasInserted) = - symtab.soNames.try_emplace(CachedHashStringRef(soName), this); + ctx.symtab->soNames.try_emplace(CachedHashStringRef(soName), this); // If a DSO appears more than once on the command line with and without // --as-needed, --no-as-needed takes precedence over --as-needed because a @@ -1574,7 +1575,7 @@ template void SharedFile::parse() { name = saver().save( (name + "@" + verName).toStringRef(versionedNameBuffer)); } - Symbol *s = symtab.addSymbol( + Symbol *s = ctx.symtab->addSymbol( Undefined{this, name, sym.getBinding(), sym.st_other, sym.getType()}); s->exportDynamic = true; if (sym.getBinding() != STB_WEAK && @@ -1598,7 +1599,7 @@ template void SharedFile::parse() { uint32_t alignment = getAlignment(sections, sym); if (ver == idx) { - auto *s = symtab.addSymbol( + auto *s = ctx.symtab->addSymbol( SharedSymbol{*this, name, sym.getBinding(), sym.st_other, sym.getType(), sym.st_value, sym.st_size, alignment}); s->dsoDefined = true; @@ -1616,7 +1617,7 @@ template void SharedFile::parse() { reinterpret_cast(verdefs[idx])->getAux()->vda_name; versionedNameBuffer.clear(); name = (name + "@" + verName).toStringRef(versionedNameBuffer); - auto *s = symtab.addSymbol( + auto *s = ctx.symtab->addSymbol( SharedSymbol{*this, saver().save(name), sym.getBinding(), sym.st_other, sym.getType(), sym.st_value, sym.st_size, alignment}); s->dsoDefined = true; @@ -1751,7 +1752,7 @@ createBitcodeSymbol(Symbol *&sym, const std::vector &keptComdats, // this way LTO can reference the same string saver's copy rather than // keeping copies of its own. objSym.Name = uniqueSaver().save(objSym.getName()); - sym = symtab.insert(objSym.getName()); + sym = ctx.symtab->insert(objSym.getName()); } int c = objSym.getComdatIndex(); @@ -1778,7 +1779,7 @@ void BitcodeFile::parse() { for (std::pair s : obj->getComdatTable()) { keptComdats.push_back( s.second == Comdat::NoDeduplicate || - symtab.comdatGroups.try_emplace(CachedHashStringRef(s.first), this) + ctx.symtab->comdatGroups.try_emplace(CachedHashStringRef(s.first), this) .second); } @@ -1810,7 +1811,7 @@ void BitcodeFile::parseLazy() { // keeping copies of its own. 
irSym.Name = uniqueSaver().save(irSym.getName()); if (!irSym.isUndefined()) { - auto *sym = symtab.insert(irSym.getName()); + auto *sym = ctx.symtab->insert(irSym.getName()); sym->resolve(LazySymbol{*this}); symbols[i] = sym; } @@ -1847,15 +1848,15 @@ void BinaryFile::parse() { llvm::StringSaver &saver = lld::saver(); - symtab.addAndCheckDuplicate(Defined{this, saver.save(s + "_start"), - STB_GLOBAL, STV_DEFAULT, STT_OBJECT, 0, 0, - section}); - symtab.addAndCheckDuplicate(Defined{this, saver.save(s + "_end"), STB_GLOBAL, - STV_DEFAULT, STT_OBJECT, data.size(), 0, - section}); - symtab.addAndCheckDuplicate(Defined{this, saver.save(s + "_size"), STB_GLOBAL, - STV_DEFAULT, STT_OBJECT, data.size(), 0, - nullptr}); + ctx.symtab->addAndCheckDuplicate(Defined{this, saver.save(s + "_start"), + STB_GLOBAL, STV_DEFAULT, STT_OBJECT, + 0, 0, section}); + ctx.symtab->addAndCheckDuplicate(Defined{this, saver.save(s + "_end"), + STB_GLOBAL, STV_DEFAULT, STT_OBJECT, + data.size(), 0, section}); + ctx.symtab->addAndCheckDuplicate(Defined{this, saver.save(s + "_size"), + STB_GLOBAL, STV_DEFAULT, STT_OBJECT, + data.size(), 0, nullptr}); } InputFile *elf::createInternalFile(StringRef name) { @@ -1902,7 +1903,7 @@ template void ObjFile::parseLazy() { for (size_t i = firstGlobal, end = eSyms.size(); i != end; ++i) { if (eSyms[i].st_shndx == SHN_UNDEF) continue; - symbols[i] = symtab.insert(CHECK(eSyms[i].getName(stringTable), this)); + symbols[i] = ctx.symtab->insert(CHECK(eSyms[i].getName(stringTable), this)); symbols[i]->resolve(LazySymbol{*this}); if (!lazy) break; diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index e9985bbae4959e..54a214e01b0892 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1129,7 +1129,7 @@ static void switchMorestackCallsToMorestackNonSplit( // If the target adjusted a function's prologue, all calls to // __morestack inside that function should be switched to // __morestack_non_split. - Symbol *moreStackNonSplit = symtab.find("__morestack_non_split"); + Symbol *moreStackNonSplit = ctx.symtab->find("__morestack_non_split"); if (!moreStackNonSplit) { error("mixing split-stack objects requires a definition of " "__morestack_non_split"); diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index 60ab8a9125977c..d5f9171ade739a 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -202,7 +202,7 @@ BitcodeCompiler::BitcodeCompiler(Ctx &ctx) : ctx(ctx) { // Initialize usedStartStop. if (ctx.bitcodeFiles.empty()) return; - for (Symbol *sym : symtab.getSymbols()) { + for (Symbol *sym : ctx.symtab->getSymbols()) { if (sym->isPlaceholder()) continue; StringRef s = sym->getName(); diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 55e5f23d041f8f..1ff33366c78976 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -230,7 +230,7 @@ void LinkerScript::addSymbol(SymbolAssignment *cmd) { Defined newSym(createInternalFile(cmd->location), cmd->name, STB_GLOBAL, visibility, value.type, symValue, 0, sec); - Symbol *sym = symtab.insert(cmd->name); + Symbol *sym = ctx.symtab->insert(cmd->name); sym->mergeProperties(newSym); newSym.overwrite(*sym); sym->isUsedInRegularObj = true; @@ -249,7 +249,7 @@ static void declareSymbol(SymbolAssignment *cmd) { // If the symbol is already defined, its order is 0 (with absence indicating // 0); otherwise it's assigned the order of the SymbolAssignment. 
- Symbol *sym = symtab.insert(cmd->name); + Symbol *sym = ctx.symtab->insert(cmd->name); if (!sym->isDefined()) ctx.scriptSymOrder.insert({sym, cmd->symOrder}); @@ -1682,7 +1682,7 @@ ExprValue LinkerScript::getSymbolValue(StringRef name, const Twine &loc) { return 0; } - if (Symbol *sym = symtab.find(name)) { + if (Symbol *sym = ctx.symtab->find(name)) { if (auto *ds = dyn_cast(sym)) { ExprValue v{ds->section, false, ds->value, loc}; // Retain the original st_type, so that the alias will get the same @@ -1781,8 +1781,8 @@ void LinkerScript::checkFinalScriptConditions() const { void LinkerScript::addScriptReferencedSymbolsToSymTable() { // Some symbols (such as __ehdr_start) are defined lazily only when there // are undefined symbols for them, so we add these to trigger that logic. - auto reference = [](StringRef name) { - Symbol *sym = symtab.addUnusedUndefined(name); + auto reference = [&ctx = ctx](StringRef name) { + Symbol *sym = ctx.symtab->addUnusedUndefined(name); sym->isUsedInRegularObj = true; sym->referenced = true; }; @@ -1811,6 +1811,6 @@ void LinkerScript::addScriptReferencedSymbolsToSymTable() { } bool LinkerScript::shouldAddProvideSym(StringRef symName) { - Symbol *sym = symtab.find(symName); + Symbol *sym = elf::ctx.symtab->find(symName); return sym && !sym->isDefined() && !sym->isCommon(); } diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index 60e62c0cab767e..b9a4e392a507a0 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -219,7 +219,7 @@ template void MarkLive::run() { // Preserve externally-visible symbols if the symbols defined by this // file can interpose other ELF file's symbols at runtime. - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (sym->includeInDynsym() && sym->partition == partition) markSymbol(sym); @@ -229,16 +229,16 @@ template void MarkLive::run() { return; } - markSymbol(symtab.find(ctx.arg.entry)); - markSymbol(symtab.find(ctx.arg.init)); - markSymbol(symtab.find(ctx.arg.fini)); + markSymbol(ctx.symtab->find(ctx.arg.entry)); + markSymbol(ctx.symtab->find(ctx.arg.init)); + markSymbol(ctx.symtab->find(ctx.arg.fini)); for (StringRef s : ctx.arg.undefined) - markSymbol(symtab.find(s)); + markSymbol(ctx.symtab->find(s)); for (StringRef s : ctx.script->referencedSymbols) - markSymbol(symtab.find(s)); - for (auto [symName, _] : symtab.cmseSymMap) { - markSymbol(symtab.cmseSymMap[symName].sym); - markSymbol(symtab.cmseSymMap[symName].acleSeSym); + markSymbol(ctx.symtab->find(s)); + for (auto [symName, _] : ctx.symtab->cmseSymMap) { + markSymbol(ctx.symtab->cmseSymMap[symName].sym); + markSymbol(ctx.symtab->cmseSymMap[symName].acleSeSym); } // Mark .eh_frame sections as live because there are usually no relocations @@ -350,8 +350,8 @@ template void MarkLive::moveToMain() { for (InputSectionBase *sec : ctx.inputSections) { if (!sec->isLive() || !isValidCIdentifier(sec->name)) continue; - if (symtab.find(("__start_" + sec->name).str()) || - symtab.find(("__stop_" + sec->name).str())) + if (ctx.symtab->find(("__start_" + sec->name).str()) || + ctx.symtab->find(("__stop_" + sec->name).str())) enqueue(sec, 0); } @@ -366,7 +366,7 @@ template void elf::markLive() { // If --gc-sections is not given, retain all input sections. if (!ctx.arg.gcSections) { // If a DSO defines a symbol referenced in a regular object, it is needed. 
- for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (auto *s = dyn_cast(sym)) if (s->isUsedInRegularObj && !s->isWeak()) cast(s->file)->isNeeded = true; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 3cc65150988a32..e0181f0809cc5d 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -295,7 +295,7 @@ static SmallSet getSymbolsAt(SharedSymbol &ss) { s.getType() == STT_TLS || s.st_value != ss.value) continue; StringRef name = check(s.getName(file.getStringTable())); - Symbol *sym = symtab.find(name); + Symbol *sym = ctx.symtab->find(name); if (auto *alias = dyn_cast_or_null(sym)) ret.insert(alias); } @@ -545,7 +545,7 @@ static std::string maybeReportDiscarded(Undefined &sym) { // If the discarded section is a COMDAT. StringRef signature = file->getShtGroupSignature(objSections, elfSec); if (const InputFile *prevailing = - symtab.comdatGroups.lookup(CachedHashStringRef(signature))) { + ctx.symtab->comdatGroups.lookup(CachedHashStringRef(signature))) { msg += "\n>>> section group signature: " + signature.str() + "\n>>> prevailing definition is in " + toString(prevailing); if (sym.nonPrevailing) { @@ -618,7 +618,7 @@ static const Symbol *getAlternativeSpelling(const Undefined &sym, return s; // If in the symbol table and not undefined. - if (const Symbol *s = symtab.find(newName)) + if (const Symbol *s = ctx.symtab->find(newName)) if (!s->isUndefined()) return s; @@ -667,7 +667,7 @@ static const Symbol *getAlternativeSpelling(const Undefined &sym, for (auto &it : map) if (name.equals_insensitive(it.first)) return it.second; - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (!sym->isUndefined() && name.equals_insensitive(sym->getName())) return sym; @@ -693,7 +693,7 @@ static const Symbol *getAlternativeSpelling(const Undefined &sym, break; } if (!s) - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) if (canSuggestExternCForCXX(name, sym->getName())) { s = sym; break; @@ -1870,7 +1870,7 @@ void elf::postScanRelocations() { } assert(ctx.symAux.size() == 1); - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) fn(*sym); // Local symbols may need the aforementioned non-preemptible ifunc and GOT @@ -2384,7 +2384,7 @@ bool elf::hexagonNeedsTLSSymbol(ArrayRef outputSections) { } void elf::hexagonTLSSymbolUpdate(ArrayRef outputSections) { - Symbol *sym = symtab.find("__tls_get_addr"); + Symbol *sym = ctx.symtab->find("__tls_get_addr"); if (!sym) return; bool needEntry = true; diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 12cd905d5c1b62..b16b2e56473adc 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -1581,7 +1581,7 @@ Expr ScriptParser::readPrimary() { // script, it must happen before this DEFINED. auto order = ctx.scriptSymOrderCounter++; return [=, &ctx = this->ctx] { - Symbol *s = symtab.find(name); + Symbol *s = ctx.symtab->find(name); return s && s->isDefined() && ctx.scriptSymOrder.lookup(s) < order ? 1 : 0; }; diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index 4a4b4d3deed409..74fa66e6d1182c 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -29,8 +29,6 @@ using namespace llvm::ELF; using namespace lld; using namespace lld::elf; -SymbolTable elf::symtab; - void SymbolTable::wrap(Symbol *sym, Symbol *real, Symbol *wrap) { // Redirect __real_foo to the original foo and foo to the original __wrap_foo. 
int &idx1 = symMap[CachedHashStringRef(sym->getName())]; diff --git a/lld/ELF/SymbolTable.h b/lld/ELF/SymbolTable.h index 269f7f284bc734..c0bc73502bbe60 100644 --- a/lld/ELF/SymbolTable.h +++ b/lld/ELF/SymbolTable.h @@ -104,8 +104,6 @@ class SymbolTable { std::optional>> demangledSyms; }; -LLVM_LIBRARY_VISIBILITY extern SymbolTable symtab; - } // namespace lld::elf #endif diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index a736b5d340fcc5..ce31c379ab1829 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1509,10 +1509,10 @@ DynamicSection::computeContents() { addInt(DT_FINI_ARRAYSZ, ctx.out.finiArray->size); } - if (Symbol *b = symtab.find(ctx.arg.init)) + if (Symbol *b = ctx.symtab->find(ctx.arg.init)) if (b->isDefined()) addInt(DT_INIT, b->getVA()); - if (Symbol *b = symtab.find(ctx.arg.fini)) + if (Symbol *b = ctx.symtab->find(ctx.arg.fini)) if (b->isDefined()) addInt(DT_FINI, b->getVA()); } @@ -1692,9 +1692,9 @@ void RelocationBaseSection::finalizeContents() { } } -void DynamicReloc::computeRaw(SymbolTableBaseSection *symtab) { +void DynamicReloc::computeRaw(SymbolTableBaseSection *symt) { r_offset = getOffset(); - r_sym = getSymIndex(symtab); + r_sym = getSymIndex(symt); addend = computeAddend(); kind = AddendOnly; // Catch errors } @@ -2327,8 +2327,9 @@ SymtabShndxSection::SymtabShndxSection() void SymtabShndxSection::writeTo(uint8_t *buf) { // We write an array of 32 bit values, where each value has 1:1 association - // with an entry in .symtab. If the corresponding entry contains SHN_XINDEX, - // we need to write actual index, otherwise, we must write SHN_UNDEF(0). + // with an entry in ctx.in.symTab if the corresponding entry contains + // SHN_XINDEX, we need to write actual index, otherwise, we must write + // SHN_UNDEF(0). buf += 4; // Ignore .symtab[0] entry. for (const SymbolTableEntry &entry : ctx.in.symTab->getSymbols()) { if (!getCommonSec(entry.sym) && getSymSectionIndex(entry.sym) == SHN_XINDEX) @@ -4640,7 +4641,7 @@ static OutputSection *findSection(StringRef name) { static Defined *addOptionalRegular(StringRef name, SectionBase *sec, uint64_t val, uint8_t stOther = STV_HIDDEN) { - Symbol *s = symtab.find(name); + Symbol *s = ctx.symtab->find(name); if (!s || s->isDefined() || s->isCommon()) return nullptr; diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 6d0634e0a16e90..34654a2c57846b 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -459,7 +459,7 @@ class DynamicReloc { /// address/the address of the corresponding GOT entry/etc. 
int64_t computeAddend() const; - void computeRaw(SymbolTableBaseSection *symtab); + void computeRaw(SymbolTableBaseSection *symt); Symbol *sym; const OutputSection *outputSec = nullptr; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 49a319c643b387..90c8d081b702fa 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -141,7 +141,7 @@ void elf::copySectionsIntoPartitions() { static Defined *addOptionalRegular(StringRef name, SectionBase *sec, uint64_t val, uint8_t stOther = STV_HIDDEN) { - Symbol *s = symtab.find(name); + Symbol *s = ctx.symtab->find(name); if (!s || s->isDefined() || s->isCommon()) return nullptr; @@ -158,8 +158,8 @@ void elf::addReservedSymbols() { if (ctx.arg.emachine == EM_MIPS) { auto addAbsolute = [](StringRef name) { Symbol *sym = - symtab.addSymbol(Defined{ctx.internalFile, name, STB_GLOBAL, - STV_HIDDEN, STT_NOTYPE, 0, 0, nullptr}); + ctx.symtab->addSymbol(Defined{ctx.internalFile, name, STB_GLOBAL, + STV_HIDDEN, STT_NOTYPE, 0, 0, nullptr}); sym->isUsedInRegularObj = true; return cast(sym); }; @@ -172,14 +172,14 @@ void elf::addReservedSymbols() { // On MIPS O32 ABI, _gp_disp is a magic symbol designates offset between // start of function and 'gp' pointer into GOT. - if (symtab.find("_gp_disp")) + if (ctx.symtab->find("_gp_disp")) ctx.sym.mipsGpDisp = addAbsolute("_gp_disp"); // The __gnu_local_gp is a magic symbol equal to the current value of 'gp' // pointer. This symbol is used in the code generated by .cpload pseudo-op // in case of using -mno-shared option. // https://sourceware.org/ml/binutils/2004-12/msg00094.html - if (symtab.find("__gnu_local_gp")) + if (ctx.symtab->find("__gnu_local_gp")) ctx.sym.mipsLocalGp = addAbsolute("__gnu_local_gp"); } else if (ctx.arg.emachine == EM_PPC) { // glibc *crt1.o has a undefined reference to _SDA_BASE_. Since we don't @@ -200,7 +200,7 @@ void elf::addReservedSymbols() { StringRef gotSymName = (ctx.arg.emachine == EM_PPC64) ? ".TOC." : "_GLOBAL_OFFSET_TABLE_"; - if (Symbol *s = symtab.find(gotSymName)) { + if (Symbol *s = ctx.symtab->find(gotSymName)) { if (s->isDefined()) { error(toString(s->file) + " cannot redefine linker defined symbol '" + gotSymName + "'"); @@ -273,7 +273,7 @@ static void demoteDefined(Defined &sym, DenseMap &map) { static void demoteSymbolsAndComputeIsPreemptible() { llvm::TimeTraceScope timeScope("Demote symbols"); DenseMap> sectionIndexMap; - for (Symbol *sym : symtab.getSymbols()) { + for (Symbol *sym : ctx.symtab->getSymbols()) { if (auto *d = dyn_cast(sym)) { if (d->section && !d->section->isLive()) demoteDefined(*d, sectionIndexMap[d->file]); @@ -1106,7 +1106,7 @@ static DenseMap buildSectionOrder() { // We want both global and local symbols. We get the global ones from the // symbol table and iterate the object files for the local ones. - for (Symbol *sym : symtab.getSymbols()) + for (Symbol *sym : ctx.symtab->getSymbols()) addSym(*sym); for (ELFFileBase *file : ctx.objectFiles) @@ -1724,7 +1724,7 @@ template void Writer::finalizeSections() { // Even the author of gold doesn't remember why gold behaves that way. 
// https://sourceware.org/ml/binutils/2002-03/msg00360.html if (ctx.mainPart->dynamic->parent) { - Symbol *s = symtab.addSymbol(Defined{ + Symbol *s = ctx.symtab->addSymbol(Defined{ ctx.internalFile, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE, /*value=*/0, /*size=*/0, ctx.mainPart->dynamic.get()}); s->isUsedInRegularObj = true; @@ -1745,7 +1745,7 @@ template void Writer::finalizeSections() { // Set riscvGlobalPointer to be used by the optional global pointer // relaxation. if (ctx.arg.relaxGP) { - Symbol *s = symtab.find("__global_pointer$"); + Symbol *s = ctx.symtab->find("__global_pointer$"); if (s && s->isDefined()) ctx.sym.riscvGlobalPointer = cast(s); } @@ -1764,7 +1764,7 @@ template void Writer::finalizeSections() { // 2) is special cased in @tpoff computation. To satisfy 1), we define it // as an absolute symbol of zero. This is different from GNU linkers which // define _TLS_MODULE_BASE_ relative to the first TLS section. - Symbol *s = symtab.find("_TLS_MODULE_BASE_"); + Symbol *s = ctx.symtab->find("_TLS_MODULE_BASE_"); if (s && s->isUndefined()) { s->resolve(Defined{ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, STT_TLS, /*value=*/0, 0, @@ -1832,7 +1832,7 @@ template void Writer::finalizeSections() { for (SharedFile *file : ctx.sharedFiles) { bool allNeededIsKnown = llvm::all_of(file->dtNeeded, [&](StringRef needed) { - return symtab.soNames.count(CachedHashStringRef(needed)); + return ctx.symtab->soNames.count(CachedHashStringRef(needed)); }); if (!allNeededIsKnown) continue; @@ -1857,7 +1857,7 @@ template void Writer::finalizeSections() { llvm::TimeTraceScope timeScope("Add symbols to symtabs"); // Now that we have defined all possible global symbols including linker- // synthesized ones. Visit all symbols to give the finishing touches. - for (Symbol *sym : symtab.getSymbols()) { + for (Symbol *sym : ctx.symtab->getSymbols()) { if (!sym->isUsedInRegularObj || !includeInSymtab(*sym)) continue; if (!ctx.arg.relocatable) @@ -1922,8 +1922,8 @@ template void Writer::finalizeSections() { if (ctx.arg.emachine == EM_HEXAGON && hexagonNeedsTLSSymbol(ctx.outputSections)) { Symbol *sym = - symtab.addSymbol(Undefined{ctx.internalFile, "__tls_get_addr", - STB_GLOBAL, STV_DEFAULT, STT_NOTYPE}); + ctx.symtab->addSymbol(Undefined{ctx.internalFile, "__tls_get_addr", + STB_GLOBAL, STV_DEFAULT, STT_NOTYPE}); sym->isPreemptible = true; ctx.partitions[0].dynSymTab->addSymbol(sym); } @@ -2701,7 +2701,7 @@ template void Writer::checkSections() { // 5. the address 0. 
static uint64_t getEntryAddr() { // Case 1, 2 or 3 - if (Symbol *b = symtab.find(ctx.arg.entry)) + if (Symbol *b = ctx.symtab->find(ctx.arg.entry)) return b->getVA(); // Case 4 diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index a9e8406608ff31..a1d00f2d2c0cd1 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -54,7 +54,7 @@ class UnwindPlan { public: class Row { public: - class RegisterLocation { + class AbstractRegisterLocation { public: enum RestoreType { unspecified, // not specified, we may be able to assume this @@ -72,11 +72,11 @@ class UnwindPlan { isConstant // reg = constant }; - RegisterLocation() : m_location() {} + AbstractRegisterLocation() : m_location() {} - bool operator==(const RegisterLocation &rhs) const; + bool operator==(const AbstractRegisterLocation &rhs) const; - bool operator!=(const RegisterLocation &rhs) const { + bool operator!=(const AbstractRegisterLocation &rhs) const { return !(*this == rhs); } @@ -337,10 +337,10 @@ class UnwindPlan { bool operator==(const Row &rhs) const; bool GetRegisterInfo(uint32_t reg_num, - RegisterLocation ®ister_location) const; + AbstractRegisterLocation ®ister_location) const; void SetRegisterInfo(uint32_t reg_num, - const RegisterLocation register_location); + const AbstractRegisterLocation register_location); void RemoveRegisterInfo(uint32_t reg_num); @@ -398,7 +398,7 @@ class UnwindPlan { lldb::addr_t base_addr) const; protected: - typedef std::map collection; + typedef std::map collection; lldb::addr_t m_offset = 0; // Offset into the function for this row FAValue m_cfa_value; diff --git a/lldb/include/lldb/Target/ABI.h b/lldb/include/lldb/Target/ABI.h index 7b646d743346b7..dd941d1c905c15 100644 --- a/lldb/include/lldb/Target/ABI.h +++ b/lldb/include/lldb/Target/ABI.h @@ -102,9 +102,9 @@ class ABI : public PluginInterface { virtual bool RegisterIsVolatile(const RegisterInfo *reg_info) = 0; - virtual bool - GetFallbackRegisterLocation(const RegisterInfo *reg_info, - UnwindPlan::Row::RegisterLocation &unwind_regloc); + virtual bool GetFallbackRegisterLocation( + const RegisterInfo *reg_info, + UnwindPlan::Row::AbstractRegisterLocation &unwind_regloc); // Should take a look at a call frame address (CFA) which is just the stack // pointer value upon entry to a function. ABIs usually impose alignment diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h index ef8ae884038663..3be9eb5c5c70fc 100644 --- a/lldb/include/lldb/Target/RegisterContextUnwind.h +++ b/lldb/include/lldb/Target/RegisterContextUnwind.h @@ -84,7 +84,7 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { // past the top (end) of the stack }; - // UnwindLLDB needs to pass around references to RegisterLocations + // UnwindLLDB needs to pass around references to ConcreteRegisterLocations friend class UnwindLLDB; // Returns true if we have an unwind loop -- the same stack frame unwinding @@ -135,29 +135,28 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { // preserved a register that this // function didn't modify/use. // - // The RegisterLocation type may be set to eRegisterNotAvailable -- this will - // happen for a volatile register - // being queried mid-stack. Instead of floating frame 0's contents of that - // register up the stack (which may - // or may not be the value of that reg when the function was executing), we - // won't return any value. 
+ // The ConcreteRegisterLocation type may be set to eRegisterNotAvailable -- + // this will happen for a volatile register being queried mid-stack. Instead + // of floating frame 0's contents of that register up the stack (which may or + // may not be the value of that reg when the function was executing), we won't + // return any value. // // If a non-volatile register (a "preserved" register) is requested mid-stack // and no frames "below" the requested // stack have saved the register anywhere, it is safe to assume that frame 0's // register values are still the same // as the requesting frame's. - lldb_private::UnwindLLDB::RegisterSearchResult - SavedLocationForRegister(uint32_t lldb_regnum, - lldb_private::UnwindLLDB::RegisterLocation ®loc); + lldb_private::UnwindLLDB::RegisterSearchResult SavedLocationForRegister( + uint32_t lldb_regnum, + lldb_private::UnwindLLDB::ConcreteRegisterLocation ®loc); bool ReadRegisterValueFromRegisterLocation( - lldb_private::UnwindLLDB::RegisterLocation regloc, + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc, const lldb_private::RegisterInfo *reg_info, lldb_private::RegisterValue &value); bool WriteRegisterValueToRegisterLocation( - lldb_private::UnwindLLDB::RegisterLocation regloc, + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc, const lldb_private::RegisterInfo *reg_info, const lldb_private::RegisterValue &value); @@ -249,7 +248,7 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { uint32_t m_frame_number; // What stack frame this RegisterContext is - std::map + std::map m_registers; // where to find reg values for this frame lldb_private::UnwindLLDB &m_parent_unwind; // The UnwindLLDB that is creating diff --git a/lldb/include/lldb/Target/UnwindLLDB.h b/lldb/include/lldb/Target/UnwindLLDB.h index f80212cde3cab0..f2f65e67a76406 100644 --- a/lldb/include/lldb/Target/UnwindLLDB.h +++ b/lldb/include/lldb/Target/UnwindLLDB.h @@ -38,7 +38,10 @@ class UnwindLLDB : public lldb_private::Unwind { protected: friend class lldb_private::RegisterContextUnwind; - struct RegisterLocation { + /// An UnwindPlan::Row::AbstractRegisterLocation, combined with the register + /// context and memory for a specific stop point, is used to create a + /// ConcreteRegisterLocation. + struct ConcreteRegisterLocation { enum RegisterLocationTypes { eRegisterNotSaved = 0, // register was not preserved by callee. If // volatile reg, is unavailable @@ -90,7 +93,8 @@ class UnwindLLDB : public lldb_private::Unwind { // Iterate over the RegisterContextUnwind's in our m_frames vector, look for // the first one that has a saved location for this reg. 
bool SearchForSavedLocationForRegister( - uint32_t lldb_regnum, lldb_private::UnwindLLDB::RegisterLocation ®loc, + uint32_t lldb_regnum, + lldb_private::UnwindLLDB::ConcreteRegisterLocation ®loc, uint32_t starting_frame_num, bool pc_register); /// Provide the list of user-specified trap handler functions diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp index cbfca1ef6a76b0..ac2d1988a176cc 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp +++ b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp @@ -644,7 +644,7 @@ bool ABISysV_s390x::CreateDefaultUnwindPlan(UnwindPlan &unwind_plan) { bool ABISysV_s390x::GetFallbackRegisterLocation( const RegisterInfo *reg_info, - UnwindPlan::Row::RegisterLocation &unwind_regloc) { + UnwindPlan::Row::AbstractRegisterLocation &unwind_regloc) { // If a volatile register is being requested, we don't want to forward the // next frame's register contents up the stack -- the register is not // retrievable at this frame. diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.h b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.h index f6c248dc59baaa..ecf3e3906dd7b9 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.h +++ b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.h @@ -43,7 +43,8 @@ class ABISysV_s390x : public lldb_private::RegInfoBasedABI { bool GetFallbackRegisterLocation( const lldb_private::RegisterInfo *reg_info, - lldb_private::UnwindPlan::Row::RegisterLocation &unwind_regloc) override; + lldb_private::UnwindPlan::Row::AbstractRegisterLocation &unwind_regloc) + override; bool CallFrameAddressIsValid(lldb::addr_t cfa) override { // Make sure the stack call frame addresses are 8 byte aligned diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp index 80b27571f43d55..bf552e19742c4a 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp @@ -324,7 +324,7 @@ void NativeProcessFreeBSD::MonitorSIGTRAP(lldb::pid_t pid) { auto thread_info = m_threads_stepping_with_breakpoint.find(thread->GetID()); if (thread_info != m_threads_stepping_with_breakpoint.end() && - threads_info->second == regctx.GetPC()) { + thread_info->second == regctx.GetPC()) { thread->SetStoppedByTrace(); Status brkpt_error = RemoveBreakpoint(thread_info->second); if (brkpt_error.Fail()) diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index 3977dc3a6d67c5..9e78ba8174e3d5 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -614,7 +614,7 @@ bool SymbolFileBreakpad::ParseCFIUnwindRow(llvm::StringRef unwind_rules, row.GetCFAValue().SetIsDWARFExpression(saved.data(), saved.size()); } else if (const RegisterInfo *info = ResolveRegisterOrRA(triple, resolver, lhs)) { - UnwindPlan::Row::RegisterLocation loc; + UnwindPlan::Row::AbstractRegisterLocation loc; loc.SetIsDWARFExpression(saved.data(), saved.size()); row.SetRegisterInfo(info->kinds[eRegisterKindLLDB], loc); } else @@ -766,7 +766,7 @@ SymbolFileBreakpad::ParseWinUnwindPlan(const Bookmark &bookmark, } llvm::ArrayRef saved = SaveAsDWARF(*it->second); - UnwindPlan::Row::RegisterLocation loc; + UnwindPlan::Row::AbstractRegisterLocation loc; loc.SetIsDWARFExpression(saved.data(), saved.size()); 
row_sp->SetRegisterInfo(info->kinds[eRegisterKindLLDB], loc); } diff --git a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp index eca78a9b3a04b4..5c846bafc24df7 100644 --- a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp +++ b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp @@ -97,7 +97,7 @@ bool UnwindAssembly_x86::AugmentUnwindPlanFromCallSite( first_row->GetCFAValue().GetOffset() != wordsize) { return false; } - UnwindPlan::Row::RegisterLocation first_row_pc_loc; + UnwindPlan::Row::AbstractRegisterLocation first_row_pc_loc; if (!first_row->GetRegisterInfo( pc_regnum.GetAsKind(unwind_plan.GetRegisterKind()), first_row_pc_loc) || @@ -126,7 +126,7 @@ bool UnwindAssembly_x86::AugmentUnwindPlanFromCallSite( // Get the register locations for eip/rip from the first & last rows. Are // they both CFA plus an offset? Is it the same offset? - UnwindPlan::Row::RegisterLocation last_row_pc_loc; + UnwindPlan::Row::AbstractRegisterLocation last_row_pc_loc; if (last_row->GetRegisterInfo( pc_regnum.GetAsKind(unwind_plan.GetRegisterKind()), last_row_pc_loc)) { diff --git a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp index 6bfaa54135a959..81b7f138fe7caa 100644 --- a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp +++ b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp @@ -915,7 +915,7 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly( addr_t current_func_text_offset = 0; int current_sp_bytes_offset_from_fa = 0; bool is_aligned = false; - UnwindPlan::Row::RegisterLocation initial_regloc; + UnwindPlan::Row::AbstractRegisterLocation initial_regloc; UnwindPlan::RowSP row(new UnwindPlan::Row); unwind_plan.SetPlanValidAddressRange(func_range); @@ -1051,7 +1051,7 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly( if (nonvolatile_reg_p(machine_regno) && machine_regno_to_lldb_regno(machine_regno, lldb_regno) && !saved_registers[machine_regno]) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; if (is_aligned) regloc.SetAtAFAPlusOffset(-current_sp_bytes_offset_from_fa); else @@ -1142,7 +1142,7 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly( !saved_registers[machine_regno]) { saved_registers[machine_regno] = true; - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // stack_offset for 'movq %r15, -80(%rbp)' will be 80. In the Row, we // want to express this as the offset from the FA. If the frame base is @@ -1234,7 +1234,7 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly( // determine the effcts of. Verify that the stack frame state // has been unwound to the same as it was at function entry to avoid // mis-identifying a JMP instruction as an epilogue. 
- UnwindPlan::Row::RegisterLocation sp, pc; + UnwindPlan::Row::AbstractRegisterLocation sp, pc; if (row->GetRegisterInfo(m_lldb_sp_regnum, sp) && row->GetRegisterInfo(m_lldb_ip_regnum, pc)) { // Any ret instruction variant is definitely indicative of an diff --git a/lldb/source/Symbol/ArmUnwindInfo.cpp b/lldb/source/Symbol/ArmUnwindInfo.cpp index 6bc3bd6cc5edfa..569e0f591cbafe 100644 --- a/lldb/source/Symbol/ArmUnwindInfo.cpp +++ b/lldb/source/Symbol/ArmUnwindInfo.cpp @@ -333,7 +333,7 @@ bool ArmUnwindInfo::GetUnwindPlan(Target &target, const Address &addr, } if (!have_location_for_pc) { - UnwindPlan::Row::RegisterLocation lr_location; + UnwindPlan::Row::AbstractRegisterLocation lr_location; if (row->GetRegisterInfo(dwarf_lr, lr_location)) row->SetRegisterInfo(dwarf_pc, lr_location); else diff --git a/lldb/source/Symbol/DWARFCallFrameInfo.cpp b/lldb/source/Symbol/DWARFCallFrameInfo.cpp index ff2610c9df2765..a743de596b8d8d 100644 --- a/lldb/source/Symbol/DWARFCallFrameInfo.cpp +++ b/lldb/source/Symbol/DWARFCallFrameInfo.cpp @@ -633,7 +633,7 @@ bool DWARFCallFrameInfo::FDEToUnwindPlan(dw_offset_t dwarf_offset, std::vector stack; - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; while (m_cfi_data.ValidOffset(offset) && offset < end_offset) { uint8_t inst = m_cfi_data.GetU8(&offset); uint8_t primary_opcode = inst & 0xC0; @@ -822,7 +822,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, int32_t data_align, lldb::offset_t &offset, UnwindPlan::Row &row) { - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; if (primary_opcode) { switch (primary_opcode) { @@ -852,7 +852,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, // except for the encoding and size of the register argument. uint32_t reg_num = (uint32_t)m_cfi_data.GetULEB128(&offset); int32_t op_offset = (int32_t)m_cfi_data.GetULEB128(&offset) * data_align; - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; reg_location.SetAtCFAPlusOffset(op_offset); row.SetRegisterInfo(reg_num, reg_location); return true; @@ -864,7 +864,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, // number. The required action is to set the rule for the specified // register to undefined. uint32_t reg_num = (uint32_t)m_cfi_data.GetULEB128(&offset); - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; reg_location.SetUndefined(); row.SetRegisterInfo(reg_num, reg_location); return true; @@ -876,7 +876,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, // number. The required action is to set the rule for the specified // register to same value. uint32_t reg_num = (uint32_t)m_cfi_data.GetULEB128(&offset); - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; reg_location.SetSame(); row.SetRegisterInfo(reg_num, reg_location); return true; @@ -889,7 +889,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, // second register. 
uint32_t reg_num = (uint32_t)m_cfi_data.GetULEB128(&offset); uint32_t other_reg_num = (uint32_t)m_cfi_data.GetULEB128(&offset); - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; reg_location.SetInRegister(other_reg_num); row.SetRegisterInfo(reg_num, reg_location); return true; @@ -950,7 +950,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, uint32_t block_len = (uint32_t)m_cfi_data.GetULEB128(&offset); const uint8_t *block_data = static_cast(m_cfi_data.GetData(&offset, block_len)); - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; reg_location.SetAtDWARFExpression(block_data, block_len); row.SetRegisterInfo(reg_num, reg_location); return true; @@ -964,7 +964,7 @@ bool DWARFCallFrameInfo::HandleCommonDwarfOpcode(uint8_t primary_opcode, // signed and factored. uint32_t reg_num = (uint32_t)m_cfi_data.GetULEB128(&offset); int32_t op_offset = (int32_t)m_cfi_data.GetSLEB128(&offset) * data_align; - UnwindPlan::Row::RegisterLocation reg_location; + UnwindPlan::Row::AbstractRegisterLocation reg_location; reg_location.SetAtCFAPlusOffset(op_offset); row.SetRegisterInfo(reg_num, reg_location); return true; diff --git a/lldb/source/Symbol/FuncUnwinders.cpp b/lldb/source/Symbol/FuncUnwinders.cpp index 228d9a1072deca..d01a899e4f3c67 100644 --- a/lldb/source/Symbol/FuncUnwinders.cpp +++ b/lldb/source/Symbol/FuncUnwinders.cpp @@ -371,8 +371,8 @@ LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation( UnwindPlan::RowSP b_first_row = b->GetRowAtIndex(0); if (a_first_row.get() && b_first_row.get()) { - UnwindPlan::Row::RegisterLocation a_pc_regloc; - UnwindPlan::Row::RegisterLocation b_pc_regloc; + UnwindPlan::Row::AbstractRegisterLocation a_pc_regloc; + UnwindPlan::Row::AbstractRegisterLocation b_pc_regloc; a_first_row->GetRegisterInfo(pc_reg_lldb_regnum, a_pc_regloc); b_first_row->GetRegisterInfo(pc_reg_lldb_regnum, b_pc_regloc); diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp index e2dbd81a82c84c..b5a9aa2094f54d 100644 --- a/lldb/source/Symbol/UnwindPlan.cpp +++ b/lldb/source/Symbol/UnwindPlan.cpp @@ -22,8 +22,8 @@ using namespace lldb; using namespace lldb_private; -bool UnwindPlan::Row::RegisterLocation:: -operator==(const UnwindPlan::Row::RegisterLocation &rhs) const { +bool UnwindPlan::Row::AbstractRegisterLocation::operator==( + const UnwindPlan::Row::AbstractRegisterLocation &rhs) const { if (m_type == rhs.m_type) { switch (m_type) { case unspecified: @@ -55,7 +55,7 @@ operator==(const UnwindPlan::Row::RegisterLocation &rhs) const { // This function doesn't copy the dwarf expression bytes; they must remain in // allocated memory for the lifespan of this UnwindPlan object. -void UnwindPlan::Row::RegisterLocation::SetAtDWARFExpression( +void UnwindPlan::Row::AbstractRegisterLocation::SetAtDWARFExpression( const uint8_t *opcodes, uint32_t len) { m_type = atDWARFExpression; m_location.expr.opcodes = opcodes; @@ -64,7 +64,7 @@ void UnwindPlan::Row::RegisterLocation::SetAtDWARFExpression( // This function doesn't copy the dwarf expression bytes; they must remain in // allocated memory for the lifespan of this UnwindPlan object. 
-void UnwindPlan::Row::RegisterLocation::SetIsDWARFExpression( +void UnwindPlan::Row::AbstractRegisterLocation::SetIsDWARFExpression( const uint8_t *opcodes, uint32_t len) { m_type = isDWARFExpression; m_location.expr.opcodes = opcodes; @@ -92,11 +92,9 @@ static void DumpDWARFExpr(Stream &s, llvm::ArrayRef expr, Thread *threa s.PutCString("dwarf-expr"); } -void UnwindPlan::Row::RegisterLocation::Dump(Stream &s, - const UnwindPlan *unwind_plan, - const UnwindPlan::Row *row, - Thread *thread, - bool verbose) const { +void UnwindPlan::Row::AbstractRegisterLocation::Dump( + Stream &s, const UnwindPlan *unwind_plan, const UnwindPlan::Row *row, + Thread *thread, bool verbose) const { switch (m_type) { case unspecified: if (verbose) @@ -255,7 +253,7 @@ UnwindPlan::Row::Row() : m_cfa_value(), m_afa_value(), m_register_locations() {} bool UnwindPlan::Row::GetRegisterInfo( uint32_t reg_num, - UnwindPlan::Row::RegisterLocation ®ister_location) const { + UnwindPlan::Row::AbstractRegisterLocation ®ister_location) const { collection::const_iterator pos = m_register_locations.find(reg_num); if (pos != m_register_locations.end()) { register_location = pos->second; @@ -277,7 +275,7 @@ void UnwindPlan::Row::RemoveRegisterInfo(uint32_t reg_num) { void UnwindPlan::Row::SetRegisterInfo( uint32_t reg_num, - const UnwindPlan::Row::RegisterLocation register_location) { + const UnwindPlan::Row::AbstractRegisterLocation register_location) { m_register_locations[reg_num] = register_location; } @@ -287,7 +285,7 @@ bool UnwindPlan::Row::SetRegisterLocationToAtCFAPlusOffset(uint32_t reg_num, if (!can_replace && m_register_locations.find(reg_num) != m_register_locations.end()) return false; - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetAtCFAPlusOffset(offset); m_register_locations[reg_num] = reg_loc; return true; @@ -299,7 +297,7 @@ bool UnwindPlan::Row::SetRegisterLocationToIsCFAPlusOffset(uint32_t reg_num, if (!can_replace && m_register_locations.find(reg_num) != m_register_locations.end()) return false; - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetIsCFAPlusOffset(offset); m_register_locations[reg_num] = reg_loc; return true; @@ -316,7 +314,7 @@ bool UnwindPlan::Row::SetRegisterLocationToUndefined( if (can_replace_only_if_unspecified && !pos->second.IsUnspecified()) return false; } - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetUndefined(); m_register_locations[reg_num] = reg_loc; return true; @@ -327,7 +325,7 @@ bool UnwindPlan::Row::SetRegisterLocationToUnspecified(uint32_t reg_num, if (!can_replace && m_register_locations.find(reg_num) != m_register_locations.end()) return false; - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetUnspecified(); m_register_locations[reg_num] = reg_loc; return true; @@ -339,7 +337,7 @@ bool UnwindPlan::Row::SetRegisterLocationToRegister(uint32_t reg_num, if (!can_replace && m_register_locations.find(reg_num) != m_register_locations.end()) return false; - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetInRegister(other_reg_num); m_register_locations[reg_num] = reg_loc; return true; @@ -350,7 +348,7 @@ bool UnwindPlan::Row::SetRegisterLocationToSame(uint32_t reg_num, if (must_replace && m_register_locations.find(reg_num) == m_register_locations.end()) return false; - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetSame(); m_register_locations[reg_num] = reg_loc; return true; @@ -362,7 +360,7 @@ bool 
UnwindPlan::Row::SetRegisterLocationToIsConstant(uint32_t reg_num, if (!can_replace && m_register_locations.find(reg_num) != m_register_locations.end()) return false; - RegisterLocation reg_loc; + AbstractRegisterLocation reg_loc; reg_loc.SetIsConstant(constant); m_register_locations[reg_num] = reg_loc; return true; diff --git a/lldb/source/Target/ABI.cpp b/lldb/source/Target/ABI.cpp index 110b5c86fc4256..1a301d4cae7a4f 100644 --- a/lldb/source/Target/ABI.cpp +++ b/lldb/source/Target/ABI.cpp @@ -210,7 +210,7 @@ bool ABI::PrepareTrivialCall(Thread &thread, lldb::addr_t sp, bool ABI::GetFallbackRegisterLocation( const RegisterInfo *reg_info, - UnwindPlan::Row::RegisterLocation &unwind_regloc) { + UnwindPlan::Row::AbstractRegisterLocation &unwind_regloc) { // Did the UnwindPlan fail to give us the caller's stack pointer? The stack // pointer is defined to be the same as THIS frame's CFA, so return the CFA // value as the caller's stack pointer. This is true on x86-32/x86-64 at diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index a61228d092d898..b683ea7237de04 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -1108,14 +1108,14 @@ uint32_t RegisterContextUnwind::ConvertRegisterKindToRegisterNumber( } bool RegisterContextUnwind::ReadRegisterValueFromRegisterLocation( - lldb_private::UnwindLLDB::RegisterLocation regloc, + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc, const RegisterInfo *reg_info, RegisterValue &value) { if (!IsValid()) return false; bool success = false; switch (regloc.type) { - case UnwindLLDB::RegisterLocation::eRegisterInLiveRegisterContext: { + case UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext: { const RegisterInfo *other_reg_info = GetRegisterInfoAtIndex(regloc.location.register_number); @@ -1125,7 +1125,7 @@ bool RegisterContextUnwind::ReadRegisterValueFromRegisterLocation( success = m_thread.GetRegisterContext()->ReadRegister(other_reg_info, value); } break; - case UnwindLLDB::RegisterLocation::eRegisterInRegister: { + case UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister: { const RegisterInfo *other_reg_info = GetRegisterInfoAtIndex(regloc.location.register_number); @@ -1139,29 +1139,29 @@ bool RegisterContextUnwind::ReadRegisterValueFromRegisterLocation( success = GetNextFrame()->ReadRegister(other_reg_info, value); } } break; - case UnwindLLDB::RegisterLocation::eRegisterValueInferred: + case UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred: success = value.SetUInt(regloc.location.inferred_value, reg_info->byte_size); break; - case UnwindLLDB::RegisterLocation::eRegisterNotSaved: + case UnwindLLDB::ConcreteRegisterLocation::eRegisterNotSaved: break; - case UnwindLLDB::RegisterLocation::eRegisterSavedAtHostMemoryLocation: + case UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtHostMemoryLocation: llvm_unreachable("FIXME debugger inferior function call unwind"); - case UnwindLLDB::RegisterLocation::eRegisterSavedAtMemoryLocation: { + case UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation: { Status error(ReadRegisterValueFromMemory( reg_info, regloc.location.target_memory_location, reg_info->byte_size, value)); success = error.Success(); } break; default: - llvm_unreachable("Unknown RegisterLocation type."); + llvm_unreachable("Unknown ConcreteRegisterLocation type."); } return success; } bool RegisterContextUnwind::WriteRegisterValueToRegisterLocation( - 
lldb_private::UnwindLLDB::RegisterLocation regloc, + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc, const RegisterInfo *reg_info, const RegisterValue &value) { if (!IsValid()) return false; @@ -1169,13 +1169,13 @@ bool RegisterContextUnwind::WriteRegisterValueToRegisterLocation( bool success = false; switch (regloc.type) { - case UnwindLLDB::RegisterLocation::eRegisterInLiveRegisterContext: { + case UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext: { const RegisterInfo *other_reg_info = GetRegisterInfoAtIndex(regloc.location.register_number); success = m_thread.GetRegisterContext()->WriteRegister(other_reg_info, value); } break; - case UnwindLLDB::RegisterLocation::eRegisterInRegister: { + case UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister: { const RegisterInfo *other_reg_info = GetRegisterInfoAtIndex(regloc.location.register_number); if (IsFrameZero()) { @@ -1185,19 +1185,19 @@ bool RegisterContextUnwind::WriteRegisterValueToRegisterLocation( success = GetNextFrame()->WriteRegister(other_reg_info, value); } } break; - case UnwindLLDB::RegisterLocation::eRegisterValueInferred: - case UnwindLLDB::RegisterLocation::eRegisterNotSaved: + case UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred: + case UnwindLLDB::ConcreteRegisterLocation::eRegisterNotSaved: break; - case UnwindLLDB::RegisterLocation::eRegisterSavedAtHostMemoryLocation: + case UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtHostMemoryLocation: llvm_unreachable("FIXME debugger inferior function call unwind"); - case UnwindLLDB::RegisterLocation::eRegisterSavedAtMemoryLocation: { + case UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation: { Status error(WriteRegisterValueToMemory( reg_info, regloc.location.target_memory_location, reg_info->byte_size, value)); success = error.Success(); } break; default: - llvm_unreachable("Unknown RegisterLocation type."); + llvm_unreachable("Unknown ConcreteRegisterLocation type."); } return success; } @@ -1259,14 +1259,15 @@ bool RegisterContextUnwind::IsTrapHandlerSymbol( enum UnwindLLDB::RegisterSearchResult RegisterContextUnwind::SavedLocationForRegister( - uint32_t lldb_regnum, lldb_private::UnwindLLDB::RegisterLocation ®loc) { + uint32_t lldb_regnum, + lldb_private::UnwindLLDB::ConcreteRegisterLocation ®loc) { RegisterNumber regnum(m_thread, eRegisterKindLLDB, lldb_regnum); Log *log = GetLog(LLDBLog::Unwind); // Have we already found this register location? if (!m_registers.empty()) { std::map::const_iterator + lldb_private::UnwindLLDB::ConcreteRegisterLocation>::const_iterator iterator; iterator = m_registers.find(regnum.GetAsKind(eRegisterKindLLDB)); if (iterator != m_registers.end()) { @@ -1279,7 +1280,7 @@ RegisterContextUnwind::SavedLocationForRegister( // Look through the available UnwindPlans for the register location. - UnwindPlan::Row::RegisterLocation unwindplan_regloc; + UnwindPlan::Row::AbstractRegisterLocation unwindplan_regloc; bool have_unwindplan_regloc = false; RegisterKind unwindplan_registerkind = kNumRegisterKinds; @@ -1353,7 +1354,7 @@ RegisterContextUnwind::SavedLocationForRegister( // signal was received, we should fetch the actual saved $pc // value instead of the Return Address register. // If $pc is not available, fall back to the RA reg. 
- UnwindPlan::Row::RegisterLocation scratch; + UnwindPlan::Row::AbstractRegisterLocation scratch; if (m_frame_type == eTrapHandlerFrame && active_row->GetRegisterInfo (pc_regnum.GetAsKind (unwindplan_registerkind), scratch)) { @@ -1404,9 +1405,9 @@ RegisterContextUnwind::SavedLocationForRegister( BehavesLikeZerothFrame()) { if (return_address_reg.GetAsKind(eRegisterKindLLDB) != LLDB_INVALID_REGNUM) { - lldb_private::UnwindLLDB::RegisterLocation new_regloc; - new_regloc.type = - UnwindLLDB::RegisterLocation::eRegisterInLiveRegisterContext; + lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc; + new_regloc.type = UnwindLLDB::ConcreteRegisterLocation:: + eRegisterInLiveRegisterContext; new_regloc.location.register_number = return_address_reg.GetAsKind(eRegisterKindLLDB); m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc; @@ -1513,9 +1514,9 @@ RegisterContextUnwind::SavedLocationForRegister( if (IsFrameZero()) { // This is frame 0 - we should return the actual live register context // value - lldb_private::UnwindLLDB::RegisterLocation new_regloc; + lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc; new_regloc.type = - UnwindLLDB::RegisterLocation::eRegisterInLiveRegisterContext; + UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext; new_regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB); m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc; regloc = new_regloc; @@ -1539,8 +1540,8 @@ RegisterContextUnwind::SavedLocationForRegister( // unwindplan_regloc has valid contents about where to retrieve the register if (unwindplan_regloc.IsUnspecified()) { - lldb_private::UnwindLLDB::RegisterLocation new_regloc = {}; - new_regloc.type = UnwindLLDB::RegisterLocation::eRegisterNotSaved; + lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc = {}; + new_regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterNotSaved; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc; UnwindLogMsg("save location for %s (%d) is unspecified, continue searching", regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB)); @@ -1564,7 +1565,7 @@ RegisterContextUnwind::SavedLocationForRegister( regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB)); return UnwindLLDB::RegisterSearchResult::eRegisterNotFound; } else { - regloc.type = UnwindLLDB::RegisterLocation::eRegisterInRegister; + regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister; regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB); m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg( @@ -1577,7 +1578,7 @@ RegisterContextUnwind::SavedLocationForRegister( if (unwindplan_regloc.IsCFAPlusOffset()) { int offset = unwindplan_regloc.GetOffset(); - regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred; + regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred; regloc.location.inferred_value = m_cfa + offset; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg("supplying caller's register %s (%d), value is CFA plus " @@ -1589,7 +1590,8 @@ RegisterContextUnwind::SavedLocationForRegister( if (unwindplan_regloc.IsAtCFAPlusOffset()) { int offset = unwindplan_regloc.GetOffset(); - regloc.type = UnwindLLDB::RegisterLocation::eRegisterSavedAtMemoryLocation; + regloc.type = + UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation; regloc.location.target_memory_location = m_cfa + offset; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; 
UnwindLogMsg("supplying caller's register %s (%d) from the stack, saved at " @@ -1604,7 +1606,7 @@ RegisterContextUnwind::SavedLocationForRegister( return UnwindLLDB::RegisterSearchResult::eRegisterNotFound; int offset = unwindplan_regloc.GetOffset(); - regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred; + regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred; regloc.location.inferred_value = m_afa + offset; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg("supplying caller's register %s (%d), value is AFA plus " @@ -1619,7 +1621,8 @@ RegisterContextUnwind::SavedLocationForRegister( return UnwindLLDB::RegisterSearchResult::eRegisterNotFound; int offset = unwindplan_regloc.GetOffset(); - regloc.type = UnwindLLDB::RegisterLocation::eRegisterSavedAtMemoryLocation; + regloc.type = + UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation; regloc.location.target_memory_location = m_afa + offset; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg("supplying caller's register %s (%d) from the stack, saved at " @@ -1639,7 +1642,7 @@ RegisterContextUnwind::SavedLocationForRegister( regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB)); return UnwindLLDB::RegisterSearchResult::eRegisterNotFound; } - regloc.type = UnwindLLDB::RegisterLocation::eRegisterInRegister; + regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister; regloc.location.register_number = row_regnum.GetAsKind(eRegisterKindLLDB); m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg( @@ -1670,7 +1673,8 @@ RegisterContextUnwind::SavedLocationForRegister( addr_t val; val = result->GetScalar().ULongLong(); if (unwindplan_regloc.IsDWARFExpression()) { - regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred; + regloc.type = + UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred; regloc.location.inferred_value = val; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg("supplying caller's register %s (%d) via DWARF expression " @@ -1678,8 +1682,8 @@ RegisterContextUnwind::SavedLocationForRegister( regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB)); return UnwindLLDB::RegisterSearchResult::eRegisterFound; } else { - regloc.type = - UnwindLLDB::RegisterLocation::eRegisterSavedAtMemoryLocation; + regloc.type = UnwindLLDB::ConcreteRegisterLocation:: + eRegisterSavedAtMemoryLocation; regloc.location.target_memory_location = val; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg("supplying caller's register %s (%d) via DWARF expression " @@ -1695,7 +1699,7 @@ RegisterContextUnwind::SavedLocationForRegister( } if (unwindplan_regloc.IsConstant()) { - regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred; + regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred; regloc.location.inferred_value = unwindplan_regloc.GetConstant(); m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc; UnwindLogMsg("supplying caller's register %s (%d) via constant value", @@ -1756,7 +1760,7 @@ bool RegisterContextUnwind::TryFallbackUnwindPlan() { addr_t old_caller_pc_value = LLDB_INVALID_ADDRESS; addr_t new_caller_pc_value = LLDB_INVALID_ADDRESS; - UnwindLLDB::RegisterLocation regloc = {}; + UnwindLLDB::ConcreteRegisterLocation regloc = {}; if (SavedLocationForRegister(pc_regnum.GetAsKind(eRegisterKindLLDB), regloc) == UnwindLLDB::RegisterSearchResult::eRegisterFound) { @@ -2188,7 +2192,7 @@ bool 
RegisterContextUnwind::ReadGPRValue(lldb::RegisterKind register_kind, generic_regnum == LLDB_REGNUM_GENERIC_RA)) pc_register = true; - lldb_private::UnwindLLDB::RegisterLocation regloc; + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc; if (!m_parent_unwind.SearchForSavedLocationForRegister( lldb_regnum, regloc, m_frame_number - 1, pc_register)) { return false; @@ -2235,7 +2239,7 @@ bool RegisterContextUnwind::ReadRegister(const RegisterInfo *reg_info, is_pc_regnum = true; } - lldb_private::UnwindLLDB::RegisterLocation regloc; + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc; // Find out where the NEXT frame saved THIS frame's register contents if (!m_parent_unwind.SearchForSavedLocationForRegister( lldb_regnum, regloc, m_frame_number - 1, is_pc_regnum)) @@ -2270,7 +2274,7 @@ bool RegisterContextUnwind::WriteRegister(const RegisterInfo *reg_info, return m_thread.GetRegisterContext()->WriteRegister(reg_info, value); } - lldb_private::UnwindLLDB::RegisterLocation regloc; + lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc; // Find out where the NEXT frame saved THIS frame's register contents if (!m_parent_unwind.SearchForSavedLocationForRegister( lldb_regnum, regloc, m_frame_number - 1, false)) diff --git a/lldb/source/Target/UnwindLLDB.cpp b/lldb/source/Target/UnwindLLDB.cpp index f43e940492b09b..4d3f23948b487c 100644 --- a/lldb/source/Target/UnwindLLDB.cpp +++ b/lldb/source/Target/UnwindLLDB.cpp @@ -474,7 +474,8 @@ UnwindLLDB::GetRegisterContextForFrameNum(uint32_t frame_num) { } bool UnwindLLDB::SearchForSavedLocationForRegister( - uint32_t lldb_regnum, lldb_private::UnwindLLDB::RegisterLocation ®loc, + uint32_t lldb_regnum, + lldb_private::UnwindLLDB::ConcreteRegisterLocation ®loc, uint32_t starting_frame_num, bool pc_reg) { int64_t frame_num = starting_frame_num; if (static_cast(frame_num) >= m_frames.size()) @@ -497,8 +498,8 @@ bool UnwindLLDB::SearchForSavedLocationForRegister( // We descended down to the live register context aka stack frame 0 and are // reading the value out of a live register. if (result == UnwindLLDB::RegisterSearchResult::eRegisterFound && - regloc.type == - UnwindLLDB::RegisterLocation::eRegisterInLiveRegisterContext) { + regloc.type == UnwindLLDB::ConcreteRegisterLocation:: + eRegisterInLiveRegisterContext) { return true; } @@ -509,7 +510,8 @@ bool UnwindLLDB::SearchForSavedLocationForRegister( // down the stack, or an actual value from a live RegisterContext at frame // 0. 
if (result == UnwindLLDB::RegisterSearchResult::eRegisterFound && - regloc.type == UnwindLLDB::RegisterLocation::eRegisterInRegister && + regloc.type == + UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister && frame_num > 0) { result = UnwindLLDB::RegisterSearchResult::eRegisterNotFound; lldb_regnum = regloc.location.register_number; diff --git a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp index 9303d6f5f3c6e7..12eb577e817e2c 100644 --- a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp +++ b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp @@ -62,7 +62,7 @@ TEST_F(TestArm64InstEmulation, TestSimpleDarwinFunction) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // 'int main() { }' compiled for arm64-apple-ios with clang uint8_t data[] = { @@ -170,7 +170,7 @@ TEST_F(TestArm64InstEmulation, TestMediumDarwinFunction) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // disassembly of -[NSPlaceholderString initWithBytes:length:encoding:] // from Foundation for iOS. @@ -332,7 +332,7 @@ TEST_F(TestArm64InstEmulation, TestFramelessThreeEpilogueFunction) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // disassembly of JSC::ARM64LogicalImmediate::findBitRange<16u> // from JavaScriptcore for iOS. @@ -431,7 +431,7 @@ TEST_F(TestArm64InstEmulation, TestRegisterSavedTwice) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // disassembly of mach_msg_sever_once from libsystem_kernel.dylib for iOS. 
uint8_t data[] = { @@ -533,7 +533,7 @@ TEST_F(TestArm64InstEmulation, TestRegisterDoubleSpills) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // this file built with clang for iOS arch arm64 optimization -Os // #include @@ -705,7 +705,7 @@ TEST_F(TestArm64InstEmulation, TestCFARegisterTrackedAcrossJumps) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; uint8_t data[] = { // prologue @@ -804,7 +804,7 @@ TEST_F(TestArm64InstEmulation, TestCFAResetToSP) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // The called_from_nodebug() from TestStepNoDebug.py // Most of the previous unit tests have $sp being set as diff --git a/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp index 9892e18d99c25f..a85aad7e29f94a 100644 --- a/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp +++ b/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp @@ -61,7 +61,7 @@ TEST_F(TestPPC64InstEmulation, TestSimpleFunction) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // prologue and epilogue of: // int main() { @@ -180,7 +180,7 @@ TEST_F(TestPPC64InstEmulation, TestMediumFunction) { UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // prologue and epilogue of main() (call-func.c), // with several calls and stack variables. 
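
The unit-test updates around this point all exercise the renamed UnwindPlan::Row::AbstractRegisterLocation in the same way: ask a plan for the row covering a function offset, look up the abstract rule recorded for a register, and check what kind of rule it is. The following is a minimal sketch of that consumer pattern, not code from this patch; the helper name GetSavedAtCFAOffset and the idea of reporting the offset through an out-parameter are illustrative, while GetRowForFunctionOffset, GetRegisterInfo, IsAtCFAPlusOffset and GetOffset are the APIs the tests themselves call.

// Sketch only (not part of this patch): typical consumption of the
// renamed UnwindPlan::Row::AbstractRegisterLocation.
#include "lldb/Symbol/UnwindPlan.h"

using namespace lldb_private;

// Returns true if `plan` records the given register as saved at
// [CFA + offset] at `func_offset`, and reports that offset.
static bool GetSavedAtCFAOffset(UnwindPlan &plan, uint32_t regnum,
                                int func_offset, int32_t &cfa_offset) {
  UnwindPlan::RowSP row_sp = plan.GetRowForFunctionOffset(func_offset);
  if (!row_sp)
    return false;

  UnwindPlan::Row::AbstractRegisterLocation regloc;
  if (!row_sp->GetRegisterInfo(regnum, regloc))
    return false; // No rule recorded for this register in this row.

  if (!regloc.IsAtCFAPlusOffset())
    return false; // A rule exists, but it is not of the [CFA+offset] form.

  cfa_offset = regloc.GetOffset();
  return true;
}

The abstract location only describes the rule ("at CFA-8", "in register X", "same as caller"); it says nothing about where the bytes actually live for a given stopped frame, which is what the ConcreteRegisterLocation side of the rename captures.
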
diff --git a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp index 597e5b2e40d5e0..3ff57b4f97f178 100644 --- a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp +++ b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp @@ -168,7 +168,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimple64bitFrameFunction) { eLazyBoolYes); EXPECT_TRUE(unwind_plan.GetSourcedFromCompiler() == eLazyBoolNo); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // 0: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8] UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(0); @@ -244,7 +244,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimple32bitFrameFunction) { eLazyBoolYes); EXPECT_TRUE(unwind_plan.GetSourcedFromCompiler() == eLazyBoolNo); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // offset 0 -- pushl %ebp UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(0); @@ -381,7 +381,7 @@ TEST_F(Testx86AssemblyInspectionEngine, Test64bitFramelessBigStackFrame) { // 33: CFA=rsp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8] // 34: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8] - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // grab the Row for when the prologue has finished executing: // 17: CFA=rsp+14496 => rbx=[CFA-56] rbp=[CFA-16] rsp=CFA+0 r12=[CFA-48] @@ -650,7 +650,7 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessBigStackFrame) { // 48: CFA=esp+14480 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8] // esp=CFA+0 eip=[CFA-4] - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; // Check that we get the CFA correct for the pic base setup sequence @@ -802,7 +802,7 @@ TEST_F(Testx86AssemblyInspectionEngine, Test64bitFramelessSmallStackFrame) { // 1: CFA=rsp+16 => rsp=CFA+0 rip=[CFA-8] // 22: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8] - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // grab the Row for when the prologue has finished executing: // 1: CFA=rsp+16 => rsp=CFA+0 rip=[CFA-8] @@ -911,7 +911,7 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessSmallStackFrame) { // row[3]: 9: CFA=esp+16 => esp=CFA+0 eip=[CFA-4] // row[4]: 34: CFA=esp +4 => esp=CFA+0 eip=[CFA-4] - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // Check unwind state before we set up the picbase register // 3: CFA=esp+16 => esp=CFA+0 eip=[CFA-4] @@ -962,7 +962,7 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessSmallStackFrame) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushRBP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1005,7 +1005,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBP) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushImm) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1054,7 +1054,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushImm) { // in the first function called in a new thread and it needs to // put a 0 as the saved pc. We pretend it didn't change the CFA. 
TEST_F(Testx86AssemblyInspectionEngine, TestPush0) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1085,7 +1085,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPush0) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushExtended) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1133,7 +1133,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushExtended) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushR15) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1161,7 +1161,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR15) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushR14) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1189,7 +1189,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR14) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushR13) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1217,7 +1217,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR13) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushR12) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1245,7 +1245,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR12) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushRBX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1276,7 +1276,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBX) { // eax, ecx, edx are all considered volatile and push/pops of them are // not tracked (except to keep track of stack pointer movement) TEST_F(Testx86AssemblyInspectionEngine, TestPushEAX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1305,7 +1305,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEAX) { // eax, ecx, edx are all considered volatile and push/pops of them are // not tracked (except to keep track of stack pointer movement) TEST_F(Testx86AssemblyInspectionEngine, TestPushECX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1334,7 +1334,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushECX) { // eax, ecx, edx are all considered volatile and push/pops of them are // not tracked (except to keep track of stack pointer movement) TEST_F(Testx86AssemblyInspectionEngine, TestPushEDX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1360,7 +1360,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEDX) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushEBX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); 
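
Circling back to the other half of the rename shown earlier in this patch (RegisterContextUnwind and UnwindLLDB): once SavedLocationForRegister() has resolved an abstract rule for a concrete frame, the result is an UnwindLLDB::ConcreteRegisterLocation whose `type` and `location` union say where the value actually is. The sketch below is a condensed illustration of how such a value is interpreted; it is not code from the patch. The helper name DescribeConcreteLocation and the description strings are invented for illustration, and the header path is assumed to be where UnwindLLDB is declared in the tree; the enumerators and fields themselves are the ones used in the hunks above.

// Sketch only (not part of this patch): interpreting a resolved
// UnwindLLDB::ConcreteRegisterLocation.
#include "lldb/Target/UnwindLLDB.h" // Assumed header location.

using namespace lldb_private;

static const char *
DescribeConcreteLocation(const UnwindLLDB::ConcreteRegisterLocation &regloc) {
  switch (regloc.type) {
  case UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext:
    return "read directly from the live register context (frame 0)";
  case UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister:
    return "held in another register (regloc.location.register_number)";
  case UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred:
    return "value reconstructed without a memory read "
           "(regloc.location.inferred_value, e.g. CFA plus an offset)";
  case UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation:
    return "spilled to target memory "
           "(regloc.location.target_memory_location)";
  case UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtHostMemoryLocation:
    return "saved in debugger host memory (inferior function call unwind)";
  case UnwindLLDB::ConcreteRegisterLocation::eRegisterNotSaved:
    return "not saved by this frame";
  default:
    return "unknown concrete location kind";
  }
}

This split is the motivation for the rename: AbstractRegisterLocation is the plan-level rule, ConcreteRegisterLocation is the per-frame answer derived from it.
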
@@ -1388,7 +1388,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEBX) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushEBP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1416,7 +1416,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEBP) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushRBPWithREX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data[] = { @@ -1444,7 +1444,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBPWithREX) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushESI) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1472,7 +1472,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushESI) { } TEST_F(Testx86AssemblyInspectionEngine, TestPushEDI) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1500,7 +1500,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEDI) { } TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; uint8_t data64_1[] = { @@ -1572,7 +1572,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) { } TEST_F(Testx86AssemblyInspectionEngine, TestSubRSP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1612,7 +1612,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSubRSP) { } TEST_F(Testx86AssemblyInspectionEngine, TestSubESP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1652,7 +1652,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSubESP) { } TEST_F(Testx86AssemblyInspectionEngine, TestAddRSP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1692,7 +1692,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddRSP) { } TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1732,7 +1732,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) { } TEST_F(Testx86AssemblyInspectionEngine, TestLEA_RSP_Pattern) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1756,7 +1756,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestLEA_RSP_Pattern) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopRBX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1782,7 +1782,7 @@ 
TEST_F(Testx86AssemblyInspectionEngine, TestPopRBX) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopRBP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1808,7 +1808,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopRBP) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopR12) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1834,7 +1834,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR12) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopR13) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1860,7 +1860,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR13) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopR14) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1886,7 +1886,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR14) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopR15) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1912,7 +1912,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR15) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopEBX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1938,7 +1938,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopEBX) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopEBP) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1964,7 +1964,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopEBP) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopRBPWithREX) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -1990,7 +1990,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopRBPWithREX) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopESI) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2016,7 +2016,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopESI) { } TEST_F(Testx86AssemblyInspectionEngine, TestPopEDI) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2044,7 +2044,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopEDI) { // We don't track these registers, but make sure the CFA address is updated // if we're defining the CFA in term of esp. 
TEST_F(Testx86AssemblyInspectionEngine, Testi386IgnoredRegisters) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2082,7 +2082,7 @@ TEST_F(Testx86AssemblyInspectionEngine, Testi386IgnoredRegisters) { } TEST_F(Testx86AssemblyInspectionEngine, TestLEAVE) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2123,7 +2123,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestLEAVE) { // pushes the addr of the next insn on the stack, and then pop that value // into a register (the "pic base" register). TEST_F(Testx86AssemblyInspectionEngine, TestCALLNextInsn) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2148,7 +2148,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestCALLNextInsn) { } TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVx86_64) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2187,7 +2187,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVx86_64) { } TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVi386) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2221,7 +2221,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVi386) { } TEST_F(Testx86AssemblyInspectionEngine, TestSpArithx86_64Augmented) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2312,7 +2312,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSpArithx86_64Augmented) { } TEST_F(Testx86AssemblyInspectionEngine, TestSimplex86_64Augmented) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2390,7 +2390,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplex86_64Augmented) { } TEST_F(Testx86AssemblyInspectionEngine, TestSimplei386ugmented) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2472,7 +2472,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplei386ugmented) { // stops // disassembling at that point (long-mode). 
TEST_F(Testx86AssemblyInspectionEngine, Test32BitOnlyInstruction) { - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; UnwindPlan::RowSP row_sp; AddressRange sample_range; UnwindPlan unwind_plan(eRegisterKindLLDB); @@ -2634,7 +2634,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestStackRealignMSVC_i386) { EXPECT_EQ(esp_minus_4, plan.GetRowForFunctionOffset(30)->GetAFAValue()); // Test saved register - UnwindPlan::Row::RegisterLocation reg_loc; + UnwindPlan::Row::AbstractRegisterLocation reg_loc; EXPECT_TRUE( plan.GetRowForFunctionOffset(27)->GetRegisterInfo(k_edi, reg_loc)); EXPECT_TRUE(reg_loc.IsAtAFAPlusOffset()); @@ -2712,7 +2712,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestReturnDetect) { eLazyBoolYes); EXPECT_TRUE(unwind_plan.GetSourcedFromCompiler() == eLazyBoolNo); - UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::Row::AbstractRegisterLocation regloc; // 0: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8] UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(0); diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index e2a6480c153aeb..64c1b5df6d582f 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -56,6 +56,13 @@ Changes to the LLVM IR * Added ``usub_cond`` and ``usub_sat`` operations to ``atomicrmw``. +* Remove the following intrinsics which can be replaced with a ``bitcast``: + + * ``llvm.nvvm.bitcast.f2i`` + * ``llvm.nvvm.bitcast.i2f`` + * ``llvm.nvvm.bitcast.d2ll`` + * ``llvm.nvvm.bitcast.ll2d`` + Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index b2cc5bc20ec42a..4ec672a6cfc33a 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -4532,6 +4532,9 @@ LLVMValueRef LLVMBuildStructGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name); LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str, const char *Name); +/** + * Deprecated: Use LLVMBuildGlobalString instead, which has identical behavior. + */ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str, const char *Name); LLVMBool LLVMGetVolatile(LLVMValueRef MemoryAccessInst); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 1729d606f6cc5a..76d51ab819f441 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -600,6 +600,9 @@ class CombinerHelper { bool matchRotateOutOfRange(MachineInstr &MI); void applyRotateOutOfRange(MachineInstr &MI); + bool matchUseVectorTruncate(MachineInstr &MI, Register &MatchInfo); + void applyUseVectorTruncate(MachineInstr &MI, Register &MatchInfo); + /// \returns true if a G_ICMP instruction \p MI can be replaced with a true /// or false constant based off of KnownBits information. bool matchICmpToTrueFalseKnownBits(MachineInstr &MI, int64_t &MatchInfo); diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 8f83dede4a0cbd..23fd8350a29b3d 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2015,6 +2015,7 @@ class IRBuilderBase { /// /// If no module is given via \p M, it is take from the insertion point basic /// block. 
+ LLVM_DEPRECATED("Use CreateGlobalString instead", "CreateGlobalString") Constant *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "", unsigned AddressSpace = 0, Module *M = nullptr, bool AddNull = true) { diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 39685c920d948d..737dd6092e2183 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -30,6 +30,10 @@ // * llvm.nvvm.max.ui --> select(x ule y, x, y) // * llvm.nvvm.max.ull --> ibid. // * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32 +// * llvm.nvvm.bitcast.f2i --> bitcast +// * llvm.nvvm.bitcast.i2f --> ibid. +// * llvm.nvvm.bitcast.d2ll --> ibid. +// * llvm.nvvm.bitcast.ll2d --> ibid. def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr @@ -1339,20 +1343,6 @@ let TargetPrefix = "nvvm" in { def int_nvvm_e5m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">, Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; -// -// Bitcast -// - - def int_nvvm_bitcast_f2i : ClangBuiltin<"__nvvm_bitcast_f2i">, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_bitcast_i2f : ClangBuiltin<"__nvvm_bitcast_i2f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_bitcast_ll2d : ClangBuiltin<"__nvvm_bitcast_ll2d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_bitcast_d2ll : ClangBuiltin<"__nvvm_bitcast_d2ll">, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - // FNS def int_nvvm_fns : ClangBuiltin<"__nvvm_fns">, diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 3f8946a7ae967e..4e751a75196ce1 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -135,6 +135,7 @@ class GlobalVariable; class GlobalAlias; class NoCFIValue; class ConstantPtrAuth; +class ConstantExpr; class Context; class Function; class Instruction; @@ -344,6 +345,7 @@ class Value { friend class GlobalAlias; // For `Val`. friend class NoCFIValue; // For `Val`. friend class ConstantPtrAuth; // For `Val`. + friend class ConstantExpr; // For `Val`. /// All values point to the context. Context &Ctx; @@ -1661,6 +1663,19 @@ class ConstantPtrAuth final : public Constant { } }; +class ConstantExpr : public Constant { + ConstantExpr(llvm::ConstantExpr *C, Context &Ctx) + : Constant(ClassID::ConstantExpr, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantExpr; + } + // TODO: Missing functions. 
+}; + class BlockAddress final : public Constant { BlockAddress(llvm::BlockAddress *C, Context &Ctx) : Constant(ClassID::BlockAddress, C, Ctx) {} diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 3367c7d7794186..2a9ca6d3d73ce6 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -40,6 +40,7 @@ DEF_CONST(GlobalAlias, GlobalAlias) DEF_CONST(BlockAddress, BlockAddress) DEF_CONST(NoCFIValue, NoCFIValue) DEF_CONST(ConstantPtrAuth, ConstantPtrAuth) +DEF_CONST(ConstantExpr, ConstantExpr) DEF_CONST(DSOLocalEquivalent, DSOLocalEquivalent) DEF_CONST(ConstantTokenNone, ConstantTokenNone) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index a2ac9e014b44ab..f99f80967797c1 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -56,6 +56,7 @@ class Type { friend class ConstantVector; // For LLVMTy. friend class CmpInst; // For LLVMTy. TODO: Cleanup after // sandboxir::VectorType is more complete. + friend class Utils; // for LLVMTy // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; diff --git a/llvm/include/llvm/SandboxIR/Utils.h b/llvm/include/llvm/SandboxIR/Utils.h new file mode 100644 index 00000000000000..ccc0030868a55b --- /dev/null +++ b/llvm/include/llvm/SandboxIR/Utils.h @@ -0,0 +1,54 @@ +//===- Utils.h --------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Collector for SandboxIR related convenience functions that don't belong in +// other classes. + +#ifndef LLVM_SANDBOXIR_UTILS_H +#define LLVM_SANDBOXIR_UTILS_H + +namespace llvm::sandboxir { + +class Utils { +public: + /// \Returns the expected type of \p Value V. For most Values this is + /// equivalent to getType, but for stores returns the stored type, rather + /// than void, and for ReturnInsts returns the returned type. + static Type *getExpectedType(const Value *V) { + if (auto *I = dyn_cast(V)) { + // A Return's value operand can be null if it returns void. + if (auto *RI = dyn_cast(I)) { + if (RI->getReturnValue() == nullptr) + return RI->getType(); + } + return getExpectedValue(I)->getType(); + } + return V->getType(); + } + + /// \Returns the expected Value for this instruction. For most instructions, + /// this is the instruction itself, but for stores returns the stored + /// operand, and for ReturnInstructions returns the returned value. + static Value *getExpectedValue(const Instruction *I) { + if (auto *SI = dyn_cast(I)) + return SI->getValueOperand(); + if (auto *RI = dyn_cast(I)) + return RI->getReturnValue(); + return const_cast(I); + } + + /// \Returns the number of bits required to represent the operands or return + /// value of \p V in \p DL. 
+ static unsigned getNumBits(Value *V, const DataLayout &DL) { + Type *Ty = getExpectedType(V); + return DL.getTypeSizeInBits(Ty->LLVMTy); + } +}; +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_UTILS_H diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index be33c77adffb51..f838c6e62a2ce3 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1505,6 +1505,13 @@ def insert_vector_elt_oob : GICombineRule< [{ return Helper.matchInsertVectorElementOOB(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +// Combine v8i8 (buildvector i8 (trunc(unmerge)), i8 (trunc), i8 (trunc), i8 (trunc), undef, undef, undef, undef) +def combine_use_vector_truncate : GICombineRule< + (defs root:$root, register_matchinfo:$matchinfo), + (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, + [{ return Helper.matchUseVectorTruncate(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyUseVectorTruncate(*${root}, ${matchinfo}); }])>; + def add_of_vscale : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), (match (G_VSCALE $left, $imm1), @@ -1912,7 +1919,8 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, sub_add_reg, select_to_minmax, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, combine_concat_vector, match_addos, - sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat]>; + sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat, + combine_use_vector_truncate]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 091061442ae120..f5b91919a96927 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -377,19 +377,19 @@ class LoopVectorizationLegality { return LAI->getDepChecker().getMaxSafeVectorWidthInBits(); } - /// Returns true if the loop has a speculative early exit, i.e. an + /// Returns true if the loop has an uncountable early exit, i.e. an /// uncountable exit that isn't the latch block. - bool hasSpeculativeEarlyExit() const { return HasSpeculativeEarlyExit; } + bool hasUncountableEarlyExit() const { return HasUncountableEarlyExit; } - /// Returns the speculative early exiting block. - BasicBlock *getSpeculativeEarlyExitingBlock() const { + /// Returns the uncountable early exiting block. + BasicBlock *getUncountableEarlyExitingBlock() const { assert(getUncountableExitingBlocks().size() == 1 && "Expected only a single uncountable exiting block"); return getUncountableExitingBlocks()[0]; } - /// Returns the destination of a speculative early exiting block. - BasicBlock *getSpeculativeEarlyExitBlock() const { + /// Returns the destination of an uncountable early exiting block. + BasicBlock *getUncountableEarlyExitBlock() const { assert(getUncountableExitBlocks().size() == 1 && "Expected only a single uncountable exit block"); return getUncountableExitBlocks()[0]; @@ -603,15 +603,17 @@ class LoopVectorizationLegality { /// the use of those function variants. bool VecCallVariantsFound = false; - /// Indicates whether this loop has a speculative early exit, i.e. 
an + /// Indicates whether this loop has an uncountable early exit, i.e. an /// uncountable exiting block that is not the latch. - bool HasSpeculativeEarlyExit = false; + bool HasUncountableEarlyExit = false; - /// Keep track of all the loop exiting blocks. + /// Keep track of all the countable and uncountable exiting blocks if + /// the exact backedge taken count is not computable. SmallVector CountableExitingBlocks; SmallVector UncountableExitingBlocks; - /// Keep track of the destinations of all uncountable exits. + /// Keep track of the destinations of all uncountable exits if the + /// exact backedge taken count is not computable. SmallVector UncountableExitBlocks; }; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 980f142f113265..3f189724763d47 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2449,13 +2449,20 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, continue; // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. + // but is not a load, we only allow it if it's a call to a function with a + // vector mapping and no pointer arguments. if (I.mayReadFromMemory()) { - // If the function has an explicit vectorized counterpart, we can safely - // assume that it can be vectorized. + auto hasPointerArgs = [](CallBase *CB) { + return any_of(CB->args(), [](Value const *Arg) { + return Arg->getType()->isPointerTy(); + }); + }; + + // If the function has an explicit vectorized counterpart, and does not + // take output/input pointers, we can safely assume that it can be + // vectorized. if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && - !VFDatabase::getMappings(*Call).empty()) + !hasPointerArgs(Call) && !VFDatabase::getMappings(*Call).empty()) continue; auto *Ld = dyn_cast(&I); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index df9c12bc9c97bd..c279289f9161bf 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -3320,6 +3320,112 @@ static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits, isConstTrueVal(TLI, Cst, IsVector, IsFP); } +// This combine tries to reduce the number of scalarised G_TRUNC instructions by +// using vector truncates instead +// +// EXAMPLE: +// %a(i32), %b(i32) = G_UNMERGE_VALUES %src(<2 x i32>) +// %T_a(i16) = G_TRUNC %a(i32) +// %T_b(i16) = G_TRUNC %b(i32) +// %Undef(i16) = G_IMPLICIT_DEF(i16) +// %dst(v4i16) = G_BUILD_VECTORS %T_a(i16), %T_b(i16), %Undef(i16), %Undef(i16) +// +// ===> +// %Undef(<2 x i32>) = G_IMPLICIT_DEF(<2 x i32>) +// %Mid(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x i32>), %Undef(<2 x i32>) +// %dst(<4 x s16>) = G_TRUNC %Mid(<4 x s32>) +// +// Only matches sources made up of G_TRUNCs followed by G_IMPLICIT_DEFs +bool CombinerHelper::matchUseVectorTruncate(MachineInstr &MI, + Register &MatchInfo) { + auto BuildMI = cast(&MI); + unsigned NumOperands = BuildMI->getNumSources(); + LLT DstTy = MRI.getType(BuildMI->getReg(0)); + + // Check the G_BUILD_VECTOR sources + unsigned I; + MachineInstr *UnmergeMI = nullptr; + + // Check all source TRUNCs come from the same UNMERGE instruction + for (I = 0; I < NumOperands; ++I) { + auto SrcMI = MRI.getVRegDef(BuildMI->getSourceReg(I)); + auto SrcMIOpc = SrcMI->getOpcode(); + + // Check if the 
G_TRUNC instructions all come from the same MI + if (SrcMIOpc == TargetOpcode::G_TRUNC) { + if (!UnmergeMI) { + UnmergeMI = MRI.getVRegDef(SrcMI->getOperand(1).getReg()); + if (UnmergeMI->getOpcode() != TargetOpcode::G_UNMERGE_VALUES) + return false; + } else { + auto UnmergeSrcMI = MRI.getVRegDef(SrcMI->getOperand(1).getReg()); + if (UnmergeMI != UnmergeSrcMI) + return false; + } + } else { + break; + } + } + if (I < 2) + return false; + + // Check the remaining source elements are only G_IMPLICIT_DEF + for (; I < NumOperands; ++I) { + auto SrcMI = MRI.getVRegDef(BuildMI->getSourceReg(I)); + auto SrcMIOpc = SrcMI->getOpcode(); + + if (SrcMIOpc != TargetOpcode::G_IMPLICIT_DEF) + return false; + } + + // Check the size of unmerge source + MatchInfo = cast(UnmergeMI)->getSourceReg(); + LLT UnmergeSrcTy = MRI.getType(MatchInfo); + if (!DstTy.getElementCount().isKnownMultipleOf(UnmergeSrcTy.getNumElements())) + return false; + + // Only generate legal instructions post-legalizer + if (!IsPreLegalize) { + LLT MidTy = DstTy.changeElementType(UnmergeSrcTy.getScalarType()); + + if (DstTy.getElementCount() != UnmergeSrcTy.getElementCount() && + !isLegal({TargetOpcode::G_CONCAT_VECTORS, {MidTy, UnmergeSrcTy}})) + return false; + + if (!isLegal({TargetOpcode::G_TRUNC, {DstTy, MidTy}})) + return false; + } + + return true; +} + +void CombinerHelper::applyUseVectorTruncate(MachineInstr &MI, + Register &MatchInfo) { + Register MidReg; + auto BuildMI = cast(&MI); + Register DstReg = BuildMI->getReg(0); + LLT DstTy = MRI.getType(DstReg); + LLT UnmergeSrcTy = MRI.getType(MatchInfo); + unsigned DstTyNumElt = DstTy.getNumElements(); + unsigned UnmergeSrcTyNumElt = UnmergeSrcTy.getNumElements(); + + // No need to pad vector if only G_TRUNC is needed + if (DstTyNumElt / UnmergeSrcTyNumElt == 1) { + MidReg = MatchInfo; + } else { + Register UndefReg = Builder.buildUndef(UnmergeSrcTy).getReg(0); + SmallVector ConcatRegs = {MatchInfo}; + for (unsigned I = 1; I < DstTyNumElt / UnmergeSrcTyNumElt; ++I) + ConcatRegs.push_back(UndefReg); + + auto MidTy = DstTy.changeElementType(UnmergeSrcTy.getScalarType()); + MidReg = Builder.buildConcatVectors(MidTy, ConcatRegs).getReg(0); + } + + Builder.buildTrunc(DstReg, MidReg); + MI.eraseFromParent(); +} + bool CombinerHelper::matchNotCmp(MachineInstr &MI, SmallVectorImpl &RegsToNegate) { assert(MI.getOpcode() == TargetOpcode::G_XOR); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b36a1245f83962..c6f6fc25080541 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -508,6 +508,7 @@ namespace { SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitFPOW(SDNode *N); + SDValue visitFCANONICALIZE(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); SDValue visitFP_TO_SINT(SDNode *N); @@ -1980,6 +1981,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FREEZE: return visitFREEZE(N); case ISD::GET_FPENV_MEM: return visitGET_FPENV_MEM(N); case ISD::SET_FPENV_MEM: return visitSET_FPENV_MEM(N); + case ISD::FCANONICALIZE: return visitFCANONICALIZE(N); case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_ADD: @@ -2090,6 +2092,19 @@ static SDValue getInputChainForNode(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFCANONICALIZE(SDNode *N) { + SDValue Operand = N->getOperand(0); + EVT VT = Operand.getValueType(); + SDLoc dl(N); + + // Canonicalize undef to quiet NaN. 
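+  // For example, (fcanonicalize f32 undef) folds to the default quiet NaN constant (0x7FC00000 for f32).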
+ if (Operand.isUndef()) { + APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics()); + return DAG.getConstantFP(CanonicalQNaN, dl, VT); + } + return SDValue(); +} + SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // If N has two operands, where one has an input chain equal to the other, // the 'other' chain is redundant. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 482f88e5c86de7..1c466ed0b77997 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5857,11 +5857,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT); SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(), LdVT.getVectorElementCount()); - const auto *MMO = LD->getMemOperand(); SDValue NewLoad = - DAG.getLoadVP(WideVT, DL, LD->getChain(), LD->getBasePtr(), Mask, EVL, - MMO->getPointerInfo(), MMO->getAlign(), MMO->getFlags(), - MMO->getAAInfo()); + DAG.getLoadVP(LD->getAddressingMode(), ISD::NON_EXTLOAD, WideVT, DL, + LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask, + EVL, LD->getMemoryVT(), LD->getMemOperand()); // Modified the chain - switch anything that used the old chain to use // the new one. diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index fca822a485cafe..1f23838b2de0ca 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -705,7 +705,7 @@ BasicBlock *CreateFailBB(Function *F, const Triple &Trip) { StackChkFail = M->getOrInsertFunction("__stack_smash_handler", Type::getVoidTy(Context), PointerType::getUnqual(Context)); - Args.push_back(B.CreateGlobalStringPtr(F->getName(), "SSH")); + Args.push_back(B.CreateGlobalString(F->getName(), "SSH")); } else { StackChkFail = M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp index 3030aa2c39b234..9a115ace6ce3a3 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp @@ -287,12 +287,12 @@ uint32_t TypeUnit::addFileNameIntoLinetable(StringEntry *Dir, } uint32_t FileIdx = 0; - FilenamesMapTy::iterator FileEntry = FileNamesMap.find({FileName, DirIdx}); - if (FileEntry == FileNamesMap.end()) { + auto [FileEntry, Inserted] = FileNamesMap.try_emplace({FileName, DirIdx}); + if (Inserted) { // We currently do not support more than UINT32_MAX files. 
assert(LineTable.Prologue.FileNames.size() < UINT32_MAX); FileIdx = LineTable.Prologue.FileNames.size(); - FileNamesMap.insert({{FileName, DirIdx}, FileIdx}); + FileEntry->second = FileIdx; LineTable.Prologue.FileNames.push_back(DWARFDebugLine::FileNameEntry()); LineTable.Prologue.FileNames.back().Name = DWARFFormValue::createFromPValue( dwarf::DW_FORM_string, FileName->getKeyData()); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7c63db8dc2bb36..7c893796a97462 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -907,8 +907,8 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr, GV.getInitializer() == Initializer) return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr); - SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "", - /* AddressSpace */ 0, &M); + SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "", + /* AddressSpace */ 0, &M); } return SrcLocStr; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index aabb617a59b589..01a064dea53b02 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1268,6 +1268,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("atomic.load.add.")) // nvvm.atomic.load.add.{f32.p,f64.p} Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p"); + else if (Name.consume_front("bitcast.")) + // nvvm.bitcast.{f2i,i2f,ll2d,d2ll} + Expand = + Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll"; else Expand = false; @@ -4258,6 +4262,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { F->getParent(), Intrinsic::convert_from_fp16, {Builder.getFloatTy()}), CI->getArgOperand(0), "h2f"); + } else if (Name.consume_front("bitcast.") && + (Name == "f2i" || Name == "i2f" || Name == "ll2d" || + Name == "d2ll")) { + Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType()); } else { Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); if (IID != Intrinsic::not_intrinsic && diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index dbb0578dd70fd5..90d3b51aeef92d 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -4069,7 +4069,7 @@ LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str, LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str, const char *Name) { - return wrap(unwrap(B)->CreateGlobalStringPtr(Str, Name)); + return wrap(unwrap(B)->CreateGlobalString(Str, Name)); } LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) { diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index b96141f7c60f0b..b1ed72be1030d6 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2888,6 +2888,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr( new ConstantPtrAuth(cast(C), *this)); break; + case llvm::Value::ConstantExprVal: + It->second = std::unique_ptr( + new ConstantExpr(cast(C), *this)); + break; default: It->second = std::unique_ptr(new Constant(C, *this)); break; diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 25989fb5988954..ead6455ddd5278 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -330,5 +330,5 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, 
commute_constant_to_rhs, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt]> { + combine_mul_cmlt, combine_use_vector_truncate]> { } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 4f6131fd835577..6a4b94a216832e 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -136,8 +136,8 @@ class AArch64AsmParser : public MCTargetAsmParser { assert(Predicated); return ElementSize; } - unsigned getDstReg() const { return Dst; } - unsigned getPgReg() const { + MCRegister getDstReg() const { return Dst; } + MCRegister getPgReg() const { assert(Predicated); return Pg; } @@ -146,8 +146,8 @@ class AArch64AsmParser : public MCTargetAsmParser { bool Active = false; bool Predicated = false; unsigned ElementSize; - unsigned Dst; - unsigned Pg; + MCRegister Dst; + MCRegister Pg; } NextPrefix; AArch64TargetStreamer &getTargetStreamer() { @@ -5234,7 +5234,7 @@ bool AArch64AsmParser::parseInstruction(ParseInstructionInfo &Info, return false; } -static inline bool isMatchingOrAlias(unsigned ZReg, unsigned Reg) { +static inline bool isMatchingOrAlias(MCRegister ZReg, MCRegister Reg) { assert((ZReg >= AArch64::Z0) && (ZReg <= AArch64::Z31)); return (ZReg == ((Reg - AArch64::B0) + AArch64::Z0)) || (ZReg == ((Reg - AArch64::H0) + AArch64::Z0)) || @@ -5322,7 +5322,7 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, if (IsWindowsArm64EC) { for (unsigned i = 0; i < Inst.getNumOperands(); ++i) { if (Inst.getOperand(i).isReg()) { - unsigned Reg = Inst.getOperand(i).getReg(); + MCRegister Reg = Inst.getOperand(i).getReg(); // At this point, vector registers are matched to their // appropriately sized alias. 
if ((Reg == AArch64::W13 || Reg == AArch64::X13) || @@ -5351,9 +5351,9 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::LDPWpre: case AArch64::LDPXpost: case AArch64::LDPXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); + MCRegister Rt = Inst.getOperand(1).getReg(); + MCRegister Rt2 = Inst.getOperand(2).getReg(); + MCRegister Rn = Inst.getOperand(3).getReg(); if (RI->isSubRegisterEq(Rn, Rt)) return Error(Loc[0], "unpredictable LDP instruction, writeback base " "is also a destination"); @@ -5376,8 +5376,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::LDPSWi: case AArch64::LDPWi: case AArch64::LDPXi: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); + MCRegister Rt = Inst.getOperand(0).getReg(); + MCRegister Rt2 = Inst.getOperand(1).getReg(); if (Rt == Rt2) return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); break; @@ -5389,8 +5389,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::LDPSpost: case AArch64::LDPSpre: case AArch64::LDPSWpost: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); + MCRegister Rt = Inst.getOperand(1).getReg(); + MCRegister Rt2 = Inst.getOperand(2).getReg(); if (Rt == Rt2) return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); break; @@ -5405,9 +5405,9 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::STPWpre: case AArch64::STPXpost: case AArch64::STPXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); + MCRegister Rt = Inst.getOperand(1).getReg(); + MCRegister Rt2 = Inst.getOperand(2).getReg(); + MCRegister Rn = Inst.getOperand(3).getReg(); if (RI->isSubRegisterEq(Rn, Rt)) return Error(Loc[0], "unpredictable STP instruction, writeback base " "is also a source"); @@ -5438,8 +5438,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::LDRSWpost: case AArch64::LDRWpost: case AArch64::LDRXpost: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); + MCRegister Rt = Inst.getOperand(1).getReg(); + MCRegister Rn = Inst.getOperand(2).getReg(); if (RI->isSubRegisterEq(Rn, Rt)) return Error(Loc[0], "unpredictable LDR instruction, writeback base " "is also a source"); @@ -5457,8 +5457,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::STRHpre: case AArch64::STRWpre: case AArch64::STRXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); + MCRegister Rt = Inst.getOperand(1).getReg(); + MCRegister Rn = Inst.getOperand(2).getReg(); if (RI->isSubRegisterEq(Rn, Rt)) return Error(Loc[0], "unpredictable STR instruction, writeback base " "is also a source"); @@ -5472,9 +5472,9 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::STLXRH: case AArch64::STLXRW: case AArch64::STLXRX: { - unsigned Rs = Inst.getOperand(0).getReg(); - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); + MCRegister Rs = Inst.getOperand(0).getReg(); + MCRegister Rt = Inst.getOperand(1).getReg(); + MCRegister Rn = Inst.getOperand(2).getReg(); if (RI->isSubRegisterEq(Rt, Rs) || (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP)) return Error(Loc[0], 
@@ -5485,10 +5485,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::STXPX: case AArch64::STLXPW: case AArch64::STLXPX: { - unsigned Rs = Inst.getOperand(0).getReg(); - unsigned Rt1 = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); + MCRegister Rs = Inst.getOperand(0).getReg(); + MCRegister Rt1 = Inst.getOperand(1).getReg(); + MCRegister Rt2 = Inst.getOperand(2).getReg(); + MCRegister Rn = Inst.getOperand(3).getReg(); if (RI->isSubRegisterEq(Rt1, Rs) || RI->isSubRegisterEq(Rt2, Rs) || (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP)) return Error(Loc[0], @@ -5497,8 +5497,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, } case AArch64::LDRABwriteback: case AArch64::LDRAAwriteback: { - unsigned Xt = Inst.getOperand(0).getReg(); - unsigned Xn = Inst.getOperand(1).getReg(); + MCRegister Xt = Inst.getOperand(0).getReg(); + MCRegister Xn = Inst.getOperand(1).getReg(); if (Xt == Xn) return Error(Loc[0], "unpredictable LDRA instruction, writeback base" @@ -5605,12 +5605,12 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::CPYETWN: case AArch64::CPYETRN: case AArch64::CPYETN: { - unsigned Xd_wb = Inst.getOperand(0).getReg(); - unsigned Xs_wb = Inst.getOperand(1).getReg(); - unsigned Xn_wb = Inst.getOperand(2).getReg(); - unsigned Xd = Inst.getOperand(3).getReg(); - unsigned Xs = Inst.getOperand(4).getReg(); - unsigned Xn = Inst.getOperand(5).getReg(); + MCRegister Xd_wb = Inst.getOperand(0).getReg(); + MCRegister Xs_wb = Inst.getOperand(1).getReg(); + MCRegister Xn_wb = Inst.getOperand(2).getReg(); + MCRegister Xd = Inst.getOperand(3).getReg(); + MCRegister Xs = Inst.getOperand(4).getReg(); + MCRegister Xn = Inst.getOperand(5).getReg(); if (Xd_wb != Xd) return Error(Loc[0], "invalid CPY instruction, Xd_wb and Xd do not match"); @@ -5655,11 +5655,11 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::MOPSSETGET: case AArch64::MOPSSETGEN: case AArch64::MOPSSETGETN: { - unsigned Xd_wb = Inst.getOperand(0).getReg(); - unsigned Xn_wb = Inst.getOperand(1).getReg(); - unsigned Xd = Inst.getOperand(2).getReg(); - unsigned Xn = Inst.getOperand(3).getReg(); - unsigned Xm = Inst.getOperand(4).getReg(); + MCRegister Xd_wb = Inst.getOperand(0).getReg(); + MCRegister Xn_wb = Inst.getOperand(1).getReg(); + MCRegister Xd = Inst.getOperand(2).getReg(); + MCRegister Xn = Inst.getOperand(3).getReg(); + MCRegister Xm = Inst.getOperand(4).getReg(); if (Xd_wb != Xd) return Error(Loc[0], "invalid SET instruction, Xd_wb and Xd do not match"); @@ -6451,7 +6451,7 @@ bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // GPR64. Twiddle it here if necessary. AArch64Operand &Op = static_cast(*Operands[2]); if (Op.isScalarReg()) { - unsigned Reg = getXRegFromWReg(Op.getReg()); + MCRegister Reg = getXRegFromWReg(Op.getReg()); Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, Op.getStartLoc(), Op.getEndLoc(), getContext()); @@ -6467,7 +6467,7 @@ bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // GPR64. Twiddle it here if necessary. 
AArch64Operand &Op = static_cast(*Operands[2]); if (Op.isScalarReg()) { - unsigned Reg = getXRegFromWReg(Op.getReg()); + MCRegister Reg = getXRegFromWReg(Op.getReg()); Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, Op.getStartLoc(), Op.getEndLoc(), getContext()); @@ -6484,7 +6484,7 @@ bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // GPR32. Twiddle it here if necessary. AArch64Operand &Op = static_cast(*Operands[1]); if (Op.isScalarReg()) { - unsigned Reg = getWRegFromXReg(Op.getReg()); + MCRegister Reg = getWRegFromXReg(Op.getReg()); Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, Op.getStartLoc(), Op.getEndLoc(), getContext()); @@ -7907,7 +7907,7 @@ ParseStatus AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { return Error(E, "expected second odd register of a consecutive same-size " "even/odd register pair"); - unsigned Pair = 0; + MCRegister Pair; if (isXReg) { Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64, &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]); @@ -8047,7 +8047,7 @@ ParseStatus AArch64AsmParser::tryParseGPR64x8(OperandVector &Operands) { MCContext &ctx = getContext(); const MCRegisterInfo *RI = ctx.getRegisterInfo(); - int X8Reg = RI->getMatchingSuperReg( + MCRegister X8Reg = RI->getMatchingSuperReg( XReg, AArch64::x8sub_0, &AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID]); if (!X8Reg) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 8ed867fc4ad172..6cb181011f8f67 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -95,7 +95,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder( {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER}) .legalFor({p0, s8, s16, s32, s64}) - .legalFor(PackedVectorAllTypeList) + .legalFor({v16s8, v8s16, v4s32, v2s64, v2p0, v8s8, v4s16, v2s32, v4s8, + v2s16, v2s8}) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64) .moreElementsToNextPow2(0) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 924d64b66b2235..adc6c5bf4ed171 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -622,7 +622,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { return CU::UNWIND_ARM64_MODE_DWARF; case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. - unsigned XReg = + MCRegister XReg = getXRegFromWReg(*MRI.getLLVMRegNum(Inst.getRegister(), true)); // Other CFA registers than FP are not supported by compact unwind. diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index c5de5b4de4aef3..7c9113f6bc2380 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -815,14 +815,14 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, uint64_t Address, O << '[' << MI->getOperand(OpNum++).getImm() << ']'; // Next the address: [xN] - unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + MCRegister AddrReg = MI->getOperand(OpNum++).getReg(); O << ", ["; printRegName(O, AddrReg); O << ']'; // Finally, there might be a post-indexed offset. 
if (LdStDesc->NaturalOffset != 0) { - unsigned Reg = MI->getOperand(OpNum++).getReg(); + MCRegister Reg = MI->getOperand(OpNum++).getReg(); if (Reg != AArch64::XZR) { O << ", "; printRegName(O, Reg); @@ -860,7 +860,7 @@ bool AArch64InstPrinter::printRangePrefetchAlias(const MCInst *MI, if ((PRFOp & Mask) != Mask) return false; // Rt != '11xxx', it's a PRFM instruction. - unsigned Rm = MI->getOperand(2).getReg(); + MCRegister Rm = MI->getOperand(2).getReg(); // "Rm" must be a 64-bit GPR for RPRFM. if (MRI.getRegClass(AArch64::GPR32RegClassID).contains(Rm)) @@ -1143,8 +1143,7 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - unsigned Reg = Op.getReg(); - printRegName(O, Reg); + printRegName(O, Op.getReg()); } else if (Op.isImm()) { printImm(MI, OpNo, STI, O); } else { @@ -1184,7 +1183,7 @@ void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); if (Reg == AArch64::XZR) markup(O, Markup::Immediate) << "#" << Imm; else @@ -1198,8 +1197,7 @@ void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); assert(Op.isReg() && "Non-register vreg operand!"); - unsigned Reg = Op.getReg(); - printRegName(O, Reg, AArch64::vreg); + printRegName(O, Op.getReg(), AArch64::vreg); } void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, @@ -1280,8 +1278,8 @@ void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at // all. if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src1 = MI->getOperand(1).getReg(); + MCRegister Dest = MI->getOperand(0).getReg(); + MCRegister Src1 = MI->getOperand(1).getReg(); if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && ExtType == AArch64_AM::UXTX) || ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && @@ -1347,7 +1345,7 @@ void AArch64InstPrinter::printPredicateAsCounter(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); if (Reg < AArch64::PN0 || Reg > AArch64::PN15) llvm_unreachable("Unsupported predicate-as-counter register"); O << "pn" << Reg - AArch64::PN0; @@ -1504,9 +1502,9 @@ void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, markup(O, Markup::Immediate) << format("#%.8f", FPImm); } -static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { +static MCRegister getNextVectorRegister(MCRegister Reg, unsigned Stride = 1) { while (Stride--) { - switch (Reg) { + switch (Reg.id()) { default: llvm_unreachable("Vector register expected!"); case AArch64::Q0: Reg = AArch64::Q1; break; @@ -1608,13 +1606,13 @@ void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI, raw_ostream &O) { static_assert(size == 64 || size == 32, "Template parameter must be either 32 or 64"); - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); unsigned Sube = (size == 32) ? AArch64::sube32 : AArch64::sube64; unsigned Subo = (size == 32) ? 
AArch64::subo32 : AArch64::subo64; - unsigned Even = MRI.getSubReg(Reg, Sube); - unsigned Odd = MRI.getSubReg(Reg, Subo); + MCRegister Even = MRI.getSubReg(Reg, Sube); + MCRegister Odd = MRI.getSubReg(Reg, Subo); printRegName(O, Even); O << ", "; printRegName(O, Odd); @@ -1649,7 +1647,7 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O, StringRef LayoutSuffix) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); O << "{ "; @@ -1679,13 +1677,13 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, Stride = 4; // Now forget about the list and find out what the first register is. - if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) + if (MCRegister FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) + else if (MCRegister FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::zsub0)) + else if (MCRegister FirstReg = MRI.getSubReg(Reg, AArch64::zsub0)) Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::psub0)) + else if (MCRegister FirstReg = MRI.getSubReg(Reg, AArch64::psub0)) Reg = FirstReg; // If it's a D-reg, we need to promote it to the equivalent Q-reg before @@ -2008,7 +2006,7 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum, default: llvm_unreachable("Invalid kind specifier."); } - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); printRegName(O, Reg); if (suffix != 0) O << '.' << suffix; @@ -2090,7 +2088,7 @@ void AArch64InstPrinter::printZPRasFPR(const MCInst *MI, unsigned OpNum, default: llvm_unreachable("Unsupported width"); } - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); printRegName(O, Reg - AArch64::Z0 + Base); } @@ -2108,21 +2106,21 @@ void AArch64InstPrinter::printExactFPImm(const MCInst *MI, unsigned OpNum, void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); printRegName(O, getWRegFromXReg(Reg)); } void AArch64InstPrinter::printGPR64x8(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); printRegName(O, MRI.getSubReg(Reg, AArch64::x8sub_0)); } void AArch64InstPrinter::printSyspXzrPair(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg(); assert(Reg == AArch64::XZR && "MC representation of SyspXzrPair should be XZR"); O << getRegisterName(Reg) << ", " << getRegisterName(Reg); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index f821bb527aedb8..9faecccb1bd104 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -27,8 +27,8 @@ namespace llvm { -inline static unsigned getWRegFromXReg(unsigned Reg) { - switch (Reg) { +inline static MCRegister getWRegFromXReg(MCRegister Reg) { + switch (Reg.id()) { case AArch64::X0: return AArch64::W0; case AArch64::X1: return AArch64::W1; case AArch64::X2: return 
AArch64::W2; @@ -67,8 +67,8 @@ inline static unsigned getWRegFromXReg(unsigned Reg) { return Reg; } -inline static unsigned getXRegFromWReg(unsigned Reg) { - switch (Reg) { +inline static MCRegister getXRegFromWReg(MCRegister Reg) { + switch (Reg.id()) { case AArch64::W0: return AArch64::X0; case AArch64::W1: return AArch64::X1; case AArch64::W2: return AArch64::X2; @@ -107,8 +107,8 @@ inline static unsigned getXRegFromWReg(unsigned Reg) { return Reg; } -inline static unsigned getXRegFromXRegTuple(unsigned RegTuple) { - switch (RegTuple) { +inline static MCRegister getXRegFromXRegTuple(MCRegister RegTuple) { + switch (RegTuple.id()) { case AArch64::X0_X1_X2_X3_X4_X5_X6_X7: return AArch64::X0; case AArch64::X2_X3_X4_X5_X6_X7_X8_X9: return AArch64::X2; case AArch64::X4_X5_X6_X7_X8_X9_X10_X11: return AArch64::X4; @@ -126,8 +126,8 @@ inline static unsigned getXRegFromXRegTuple(unsigned RegTuple) { return RegTuple; } -static inline unsigned getBRegFromDReg(unsigned Reg) { - switch (Reg) { +static inline MCRegister getBRegFromDReg(MCRegister Reg) { + switch (Reg.id()) { case AArch64::D0: return AArch64::B0; case AArch64::D1: return AArch64::B1; case AArch64::D2: return AArch64::B2; @@ -165,9 +165,8 @@ static inline unsigned getBRegFromDReg(unsigned Reg) { return Reg; } - -static inline unsigned getDRegFromBReg(unsigned Reg) { - switch (Reg) { +static inline MCRegister getDRegFromBReg(MCRegister Reg) { + switch (Reg.id()) { case AArch64::B0: return AArch64::D0; case AArch64::B1: return AArch64::D1; case AArch64::B2: return AArch64::D2; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 5757ac0d4454d0..919e698e76b33b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1900,6 +1900,10 @@ def isGFX940Plus : Predicate<"Subtarget->hasGFX940Insts()">, AssemblerPredicate<(all_of FeatureGFX940Insts)>; +def isNotGFX940Plus : + Predicate<"!Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of (not FeatureGFX940Insts))>; + def isGFX8GFX9NotGFX940 : Predicate<"!Subtarget->hasGFX940Insts() &&" "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 532ece8b16c5e3..6bdff9862e55ac 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1132,7 +1132,7 @@ let OtherPredicates = [HasGFX10_BEncoding] in { >; } -let SubtargetPredicate = isGFX8GFX9 in { +let SubtargetPredicate = isGFX8GFX9NotGFX940 in { def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; } @@ -1214,7 +1214,7 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < } // End HasD16LoadStore -let SubtargetPredicate = isNotGFX12Plus in +let SubtargetPredicate = isNotGFX940Plus in def BUFFER_WBINVL1 : MUBUF_Invalidate < "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1 >; @@ -1297,6 +1297,7 @@ let SubtargetPredicate = isGFX7Plus in { // Instruction definitions for CI and newer. 
//===----------------------------------------------------------------------===// +let SubtargetPredicate = isNotGFX940Plus in def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol>; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index a4598de96fa3b4..e7dfbfb64fa521 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -144,10 +144,9 @@ namespace { void ProcessLoop() { std::function<void(MachineBasicBlock *)> Search = [this, &Search](MachineBasicBlock *MBB) -> void { - if (Visited.count(MBB)) + if (!Visited.insert(MBB).second) return; - Visited.insert(MBB); for (auto *Succ : MBB->successors()) { if (!ML.contains(Succ)) continue; diff --git a/llvm/lib/Target/Mips/Mips.h b/llvm/lib/Target/Mips/Mips.h index e3e9e171916704..f99dadd25e34d6 100644 --- a/llvm/lib/Target/Mips/Mips.h +++ b/llvm/lib/Target/Mips/Mips.h @@ -17,6 +17,17 @@ #include "MCTargetDesc/MipsMCTargetDesc.h" #include "llvm/Target/TargetMachine.h" +#define IsMFLOMFHI(instr) \ + (instr == Mips::MFLO || instr == Mips::MFLO64 || instr == Mips::MFHI || \ + instr == Mips::MFHI64) +#define IsDIVMULT(instr) \ + (instr == Mips::SDIV || instr == Mips::PseudoSDIV || instr == Mips::DSDIV || \ + instr == Mips::PseudoDSDIV || instr == Mips::UDIV || \ + instr == Mips::PseudoUDIV || instr == Mips::DUDIV || \ + instr == Mips::PseudoDUDIV || instr == Mips::MULT || \ + instr == Mips::PseudoMULT || instr == Mips::DMULT || \ + instr == Mips::PseudoDMULT) + namespace llvm { class FunctionPass; class InstructionSelector; diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp index 721e525331c6ce..5d01c698b3e04e 100644 --- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp @@ -167,6 +167,9 @@ class MipsBranchExpansion : public MachineFunctionPass { bool handleFPUDelaySlot(); bool handleLoadDelaySlot(); bool handlePossibleLongBranch(); + bool handleMFLO(); + template <typename Pred, typename Safe> + bool handleMFLOSlot(Pred Predicate, Safe SafeInSlot); const MipsSubtarget *STI; const MipsInstrInfo *TII; @@ -741,6 +744,53 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { MBB.removeLiveIn(Mips::V0); } +template <typename Pred, typename Safe> +bool MipsBranchExpansion::handleMFLOSlot(Pred Predicate, Safe SafeInSlot) { + bool Changed = false; + bool hasPendingMFLO = false; + + for (MachineFunction::iterator FI = MFp->begin(); FI != MFp->end(); ++FI) { + for (Iter I = FI->begin(); I != FI->end(); ++I) { + + if (!Predicate(*I) && !hasPendingMFLO) { + continue; + } + + Iter IInSlot; + bool LastInstInFunction = + std::next(I) == FI->end() && std::next(FI) == MFp->end(); + // We need to process several situations: + // mflo is the last instruction, do not process; + // mflo + div, add two nops between them; + // mflo + non-div + non-div, do not process; + // mflo + non-div + div, add a nop between the non-div and the div.
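+      // For example, mflo $2 immediately followed by div $3, $4 becomes mflo $2, nop, nop, div $3, $4.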
+ if (!LastInstInFunction) { + std::pair Res = getNextMachineInstr(std::next(I), &*FI); + LastInstInFunction |= Res.second; + IInSlot = Res.first; + if (!SafeInSlot(*IInSlot, *I)) { + Changed = true; + TII->insertNop(*(I->getParent()), std::next(I), I->getDebugLoc()) + ->bundleWithPred(); + NumInsertedNops++; + if (IsMFLOMFHI(I->getOpcode())) { + TII->insertNop(*(I->getParent()), std::next(I), I->getDebugLoc()) + ->bundleWithPred(); + NumInsertedNops++; + } + if (hasPendingMFLO) + hasPendingMFLO = false; + } else if (hasPendingMFLO) + hasPendingMFLO = false; + else if (IsMFLOMFHI(I->getOpcode())) + hasPendingMFLO = true; + } + } + } + + return Changed; +} + template bool MipsBranchExpansion::handleSlot(Pred Predicate, Safe SafeInSlot) { bool Changed = false; @@ -777,6 +827,19 @@ bool MipsBranchExpansion::handleSlot(Pred Predicate, Safe SafeInSlot) { return Changed; } +bool MipsBranchExpansion::handleMFLO() { + // mips1-4 require a minimum of 2 instructions between a mflo/mfhi + // and the next mul/div instruction. + if (STI->hasMips32() || STI->hasMips5()) + return false; + + return handleMFLOSlot( + [this](auto &I) -> bool { return TII->IsMfloOrMfhi(I); }, + [this](auto &IInSlot, auto &I) -> bool { + return TII->SafeAfterMflo(IInSlot); + }); +} + bool MipsBranchExpansion::handleForbiddenSlot() { // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6. if (!STI->hasMips32r6() || STI->inMicroMipsMode()) @@ -893,16 +956,19 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) { bool forbiddenSlotChanged = handleForbiddenSlot(); bool fpuDelaySlotChanged = handleFPUDelaySlot(); bool loadDelaySlotChanged = handleLoadDelaySlot(); + bool MfloChanged = handleMFLO(); bool Changed = longBranchChanged || forbiddenSlotChanged || - fpuDelaySlotChanged || loadDelaySlotChanged; + fpuDelaySlotChanged || loadDelaySlotChanged || MfloChanged; // Then run them alternatively while there are changes. while (forbiddenSlotChanged) { longBranchChanged = handlePossibleLongBranch(); fpuDelaySlotChanged = handleFPUDelaySlot(); loadDelaySlotChanged = handleLoadDelaySlot(); - if (!longBranchChanged && !fpuDelaySlotChanged && !loadDelaySlotChanged) + MfloChanged = handleMFLO(); + if (!longBranchChanged && !fpuDelaySlotChanged && !loadDelaySlotChanged && + !MfloChanged) break; forbiddenSlotChanged = handleForbiddenSlot(); } diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 4ec01ab7b45659..a576c531c8c013 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -744,6 +744,12 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin, bool InMicroMipsMode = STI.inMicroMipsMode(); const MipsInstrInfo *TII = STI.getInstrInfo(); unsigned Opcode = (*Slot).getOpcode(); + + // In mips1-4, should not put mflo into the delay slot for the return. + if ((IsMFLOMFHI(CurrI->getOpcode())) && + (!STI.hasMips32() && !STI.hasMips5())) + continue; + // This is complicated by the tail call optimization. For non-PIC code // there is only a 32bit sized unconditional branch which can be assumed // to be able to reach the target. b16 only has a range of +/- 1 KB. 
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index f4fba5e53132df..d33652b4d2e3ab 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -13,6 +13,7 @@ #include "MipsInstrInfo.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsMCTargetDesc.h" +#include "Mips.h" #include "MipsSubtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -571,6 +572,13 @@ unsigned MipsInstrInfo::getEquivalentCompactForm( return 0; } +bool MipsInstrInfo::SafeAfterMflo(const MachineInstr &MI) const { + if (IsDIVMULT(MI.getOpcode())) + return false; + + return true; +} + /// Predicate for distingushing between control transfer instructions and all /// other instructions for handling forbidden slots. Consider inline assembly /// as unsafe as well. @@ -623,6 +631,13 @@ bool MipsInstrInfo::SafeInLoadDelaySlot(const MachineInstr &MIInSlot, }); } +bool MipsInstrInfo::IsMfloOrMfhi(const MachineInstr &MI) const { + if (IsMFLOMFHI(MI.getOpcode())) + return true; + + return false; +} + /// Predicate for distingushing instructions that have forbidden slots. bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const { return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 4e039e0e32aba6..2ff12f80b1714d 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -89,6 +89,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override; + bool SafeAfterMflo(const MachineInstr &MI) const; + /// Predicate to determine if an instruction can go in a forbidden slot. bool SafeInForbiddenSlot(const MachineInstr &MI) const; @@ -100,6 +102,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { bool SafeInLoadDelaySlot(const MachineInstr &MIInSlot, const MachineInstr &LoadMI) const; + bool IsMfloOrMfhi(const MachineInstr &MI) const; + /// Predicate to determine if an instruction has a forbidden slot. 
bool HasForbiddenSlot(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 5b568b0487b45a..7d6442a611125f 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "NVPTXUtilities.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -95,228 +96,262 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier) { + const char *M) { const MCOperand &MO = MI->getOperand(OpNum); int64_t Imm = MO.getImm(); + llvm::StringRef Modifier(M); - if (strcmp(Modifier, "ftz") == 0) { + if (Modifier == "ftz") { // FTZ flag if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG) O << ".ftz"; - } else if (strcmp(Modifier, "sat") == 0) { + return; + } else if (Modifier == "sat") { // SAT flag if (Imm & NVPTX::PTXCvtMode::SAT_FLAG) O << ".sat"; - } else if (strcmp(Modifier, "relu") == 0) { + return; + } else if (Modifier == "relu") { // RELU flag if (Imm & NVPTX::PTXCvtMode::RELU_FLAG) O << ".relu"; - } else if (strcmp(Modifier, "base") == 0) { + return; + } else if (Modifier == "base") { // Default operand switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) { default: return; case NVPTX::PTXCvtMode::NONE: - break; + return; case NVPTX::PTXCvtMode::RNI: O << ".rni"; - break; + return; case NVPTX::PTXCvtMode::RZI: O << ".rzi"; - break; + return; case NVPTX::PTXCvtMode::RMI: O << ".rmi"; - break; + return; case NVPTX::PTXCvtMode::RPI: O << ".rpi"; - break; + return; case NVPTX::PTXCvtMode::RN: O << ".rn"; - break; + return; case NVPTX::PTXCvtMode::RZ: O << ".rz"; - break; + return; case NVPTX::PTXCvtMode::RM: O << ".rm"; - break; + return; case NVPTX::PTXCvtMode::RP: O << ".rp"; - break; + return; case NVPTX::PTXCvtMode::RNA: O << ".rna"; - break; + return; } - } else { - llvm_unreachable("Invalid conversion modifier"); } + llvm_unreachable("Invalid conversion modifier"); } void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier) { + const char *M) { const MCOperand &MO = MI->getOperand(OpNum); int64_t Imm = MO.getImm(); + llvm::StringRef Modifier(M); - if (strcmp(Modifier, "ftz") == 0) { + if (Modifier == "ftz") { // FTZ flag if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG) O << ".ftz"; - } else if (strcmp(Modifier, "base") == 0) { + return; + } else if (Modifier == "base") { switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) { default: return; case NVPTX::PTXCmpMode::EQ: O << ".eq"; - break; + return; case NVPTX::PTXCmpMode::NE: O << ".ne"; - break; + return; case NVPTX::PTXCmpMode::LT: O << ".lt"; - break; + return; case NVPTX::PTXCmpMode::LE: O << ".le"; - break; + return; case NVPTX::PTXCmpMode::GT: O << ".gt"; - break; + return; case NVPTX::PTXCmpMode::GE: O << ".ge"; - break; + return; case NVPTX::PTXCmpMode::LO: O << ".lo"; - break; + return; case NVPTX::PTXCmpMode::LS: O << ".ls"; - break; + return; case NVPTX::PTXCmpMode::HI: O << ".hi"; - break; + return; case NVPTX::PTXCmpMode::HS: O << ".hs"; - break; + return; case NVPTX::PTXCmpMode::EQU: O << ".equ"; - break; + return; case NVPTX::PTXCmpMode::NEU: O << ".neu"; - break; + return; case NVPTX::PTXCmpMode::LTU: O << ".ltu"; - break; + return; case NVPTX::PTXCmpMode::LEU: O << ".leu"; - break; 
+ return; case NVPTX::PTXCmpMode::GTU: O << ".gtu"; - break; + return; case NVPTX::PTXCmpMode::GEU: O << ".geu"; - break; + return; case NVPTX::PTXCmpMode::NUM: O << ".num"; - break; + return; case NVPTX::PTXCmpMode::NotANumber: O << ".nan"; - break; + return; } - } else { - llvm_unreachable("Empty Modifier"); } + llvm_unreachable("Empty Modifier"); } void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier) { - if (Modifier) { - const MCOperand &MO = MI->getOperand(OpNum); - int Imm = (int) MO.getImm(); - if (!strcmp(Modifier, "sem")) { - auto Ordering = NVPTX::Ordering(Imm); - switch (Ordering) { - case NVPTX::Ordering::NotAtomic: - break; - case NVPTX::Ordering::Volatile: - O << ".volatile"; - break; - case NVPTX::Ordering::Relaxed: - O << ".relaxed.sys"; - break; - case NVPTX::Ordering::Acquire: - O << ".acquire.sys"; - break; - case NVPTX::Ordering::Release: - O << ".release.sys"; - break; - case NVPTX::Ordering::RelaxedMMIO: - O << ".mmio.relaxed.sys"; - break; - default: - report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" sem modifier.", - OrderingToCString(Ordering))); - } - } else if (!strcmp(Modifier, "addsp")) { - switch (Imm) { - case NVPTX::PTXLdStInstCode::GLOBAL: - O << ".global"; - break; - case NVPTX::PTXLdStInstCode::SHARED: - O << ".shared"; - break; - case NVPTX::PTXLdStInstCode::LOCAL: - O << ".local"; - break; - case NVPTX::PTXLdStInstCode::PARAM: - O << ".param"; - break; - case NVPTX::PTXLdStInstCode::CONSTANT: - O << ".const"; - break; - case NVPTX::PTXLdStInstCode::GENERIC: - break; - default: - llvm_unreachable("Wrong Address Space"); - } - } else if (!strcmp(Modifier, "sign")) { - if (Imm == NVPTX::PTXLdStInstCode::Signed) - O << "s"; - else if (Imm == NVPTX::PTXLdStInstCode::Unsigned) - O << "u"; - else if (Imm == NVPTX::PTXLdStInstCode::Untyped) - O << "b"; - else if (Imm == NVPTX::PTXLdStInstCode::Float) - O << "f"; - else - llvm_unreachable("Unknown register type"); - } else if (!strcmp(Modifier, "vec")) { - if (Imm == NVPTX::PTXLdStInstCode::V2) - O << ".v2"; - else if (Imm == NVPTX::PTXLdStInstCode::V4) - O << ".v4"; - } else - llvm_unreachable("Unknown Modifier"); - } else - llvm_unreachable("Empty Modifier"); + raw_ostream &O, const char *M) { + llvm::StringRef Modifier(M); + const MCOperand &MO = MI->getOperand(OpNum); + int Imm = (int)MO.getImm(); + if (Modifier == "sem") { + auto Ordering = NVPTX::Ordering(Imm); + switch (Ordering) { + case NVPTX::Ordering::NotAtomic: + return; + case NVPTX::Ordering::Relaxed: + O << ".relaxed"; + return; + case NVPTX::Ordering::Acquire: + O << ".acquire"; + return; + case NVPTX::Ordering::Release: + O << ".release"; + return; + case NVPTX::Ordering::Volatile: + O << ".volatile"; + return; + case NVPTX::Ordering::RelaxedMMIO: + O << ".mmio.relaxed"; + return; + default: + report_fatal_error(formatv( + "NVPTX LdStCode Printer does not support \"{}\" sem modifier. 
" + "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.", + OrderingToString(Ordering))); + } + } else if (Modifier == "scope") { + auto S = NVPTX::Scope(Imm); + switch (S) { + case NVPTX::Scope::Thread: + return; + case NVPTX::Scope::System: + O << ".sys"; + return; + case NVPTX::Scope::Block: + O << ".cta"; + return; + case NVPTX::Scope::Cluster: + O << ".cluster"; + return; + case NVPTX::Scope::Device: + O << ".gpu"; + return; + } + report_fatal_error( + formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.", + ScopeToString(S))); + } else if (Modifier == "addsp") { + auto A = NVPTX::AddressSpace(Imm); + switch (A) { + case NVPTX::AddressSpace::Generic: + return; + case NVPTX::AddressSpace::Global: + case NVPTX::AddressSpace::Const: + case NVPTX::AddressSpace::Shared: + case NVPTX::AddressSpace::Param: + case NVPTX::AddressSpace::Local: + O << "." << A; + return; + } + report_fatal_error(formatv( + "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.", + AddressSpaceToString(A))); + } else if (Modifier == "sign") { + switch (Imm) { + case NVPTX::PTXLdStInstCode::Signed: + O << "s"; + return; + case NVPTX::PTXLdStInstCode::Unsigned: + O << "u"; + return; + case NVPTX::PTXLdStInstCode::Untyped: + O << "b"; + return; + case NVPTX::PTXLdStInstCode::Float: + O << "f"; + return; + default: + llvm_unreachable("Unknown register type"); + } + } else if (Modifier == "vec") { + switch (Imm) { + case NVPTX::PTXLdStInstCode::V2: + O << ".v2"; + return; + case NVPTX::PTXLdStInstCode::V4: + O << ".v4"; + return; + } + // TODO: evaluate whether cases not covered by this switch are bugs + return; + } + llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str()); } void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier) { + const char *M) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); - if (Modifier == nullptr || strcmp(Modifier, "version") == 0) { + llvm::StringRef Modifier(M); + if (Modifier.empty() || Modifier == "version") { O << Imm; // Just print out PTX version - } else if (strcmp(Modifier, "aligned") == 0) { + return; + } else if (Modifier == "aligned") { // PTX63 requires '.aligned' in the name of the instruction. 
if (Imm >= 63) O << ".aligned"; - } else - llvm_unreachable("Unknown Modifier"); + return; + } + llvm_unreachable("Unknown Modifier"); } void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier) { + raw_ostream &O, const char *M) { printOperand(MI, OpNum, O); + llvm::StringRef Modifier(M); - if (Modifier && !strcmp(Modifier, "add")) { + if (Modifier == "add") { O << ", "; printOperand(MI, OpNum + 1, O); } else { @@ -346,24 +381,24 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum, default: return; case NVPTX::PTXPrmtMode::NONE: - break; + return; case NVPTX::PTXPrmtMode::F4E: O << ".f4e"; - break; + return; case NVPTX::PTXPrmtMode::B4E: O << ".b4e"; - break; + return; case NVPTX::PTXPrmtMode::RC8: O << ".rc8"; - break; + return; case NVPTX::PTXPrmtMode::ECL: O << ".ecl"; - break; + return; case NVPTX::PTXPrmtMode::ECR: O << ".ecr"; - break; + return; case NVPTX::PTXPrmtMode::RC16: O << ".rc16"; - break; + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index f6f6acb9e13c90..f6ab81d3ca0bb2 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -117,23 +117,37 @@ enum Ordering : OrderingUnderlyingType { // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire" Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire, Release = (OrderingUnderlyingType)AtomicOrdering::Release, - // AcquireRelease = 6, // TODO + AcquireRelease = (OrderingUnderlyingType)AtomicOrdering::AcquireRelease, SequentiallyConsistent = (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent, Volatile = SequentiallyConsistent + 1, RelaxedMMIO = Volatile + 1, - LAST = RelaxedMMIO + LASTORDERING = RelaxedMMIO }; -namespace PTXLdStInstCode { -enum AddressSpace { - GENERIC = 0, - GLOBAL = 1, - CONSTANT = 2, - SHARED = 3, - PARAM = 4, - LOCAL = 5 +using ScopeUnderlyingType = unsigned int; +enum Scope : ScopeUnderlyingType { + Thread = 0, + Block = 1, + Cluster = 2, + Device = 3, + System = 4, + LASTSCOPE = System +}; + +using AddressSpaceUnderlyingType = unsigned int; +enum AddressSpace : AddressSpaceUnderlyingType { + Generic = 0, + Global = 1, + Shared = 3, + Const = 4, + Local = 5, + + // NVPTX Backend Private: + Param = 101 }; + +namespace PTXLdStInstCode { enum FromType { Unsigned = 0, Signed, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 4f0bc1a2044642..56c96ea943b89d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -59,6 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget(); + Scopes = NVPTXScopes(MF.getFunction().getContext()); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -106,6 +107,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStore(N)) return; break; + case ISD::ATOMIC_FENCE: + if (tryFence(N)) + return; + break; case ISD::EXTRACT_VECTOR_ELT: if (tryEXTRACT_VECTOR_ELEMENT(N)) return; @@ -699,20 +704,26 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { const Value *Src = N->getMemOperand()->getValue(); if (!Src) - return NVPTX::PTXLdStInstCode::GENERIC; + return NVPTX::AddressSpace::Generic; if (auto *PT = dyn_cast(Src->getType())) { switch (PT->getAddressSpace()) { - case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; - case llvm::ADDRESS_SPACE_GLOBAL: return 
NVPTX::PTXLdStInstCode::GLOBAL; - case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; - case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; - case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; - case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT; + case llvm::ADDRESS_SPACE_LOCAL: + return NVPTX::AddressSpace::Local; + case llvm::ADDRESS_SPACE_GLOBAL: + return NVPTX::AddressSpace::Global; + case llvm::ADDRESS_SPACE_SHARED: + return NVPTX::AddressSpace::Shared; + case llvm::ADDRESS_SPACE_GENERIC: + return NVPTX::AddressSpace::Generic; + case llvm::ADDRESS_SPACE_PARAM: + return NVPTX::AddressSpace::Param; + case llvm::ADDRESS_SPACE_CONST: + return NVPTX::AddressSpace::Const; default: break; } } - return NVPTX::PTXLdStInstCode::GENERIC; + return NVPTX::AddressSpace::Generic; } namespace { @@ -815,9 +826,9 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // - the "weak" memory instruction we are currently lowering to, and // - some other instruction that preserves the side-effect, e.g., // a dead dummy volatile load. - if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || - CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || - CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) { + if (CodeAddrSpace == NVPTX::AddressSpace::Local || + CodeAddrSpace == NVPTX::AddressSpace::Const || + CodeAddrSpace == NVPTX::AddressSpace::Param) { return NVPTX::Ordering::NotAtomic; } @@ -842,14 +853,14 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // atomics is undefined if the generic address does not refer to a .global or // .shared memory location. bool AddrGenericOrGlobalOrShared = - (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || - CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || - CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); + (CodeAddrSpace == NVPTX::AddressSpace::Generic || + CodeAddrSpace == NVPTX::AddressSpace::Global || + CodeAddrSpace == NVPTX::AddressSpace::Shared); if (!AddrGenericOrGlobalOrShared) return NVPTX::Ordering::NotAtomic; bool UseRelaxedMMIO = - HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; + HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global; switch (Ordering) { case AtomicOrdering::NotAtomic: @@ -915,6 +926,40 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { } // namespace +NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, + NVPTX::Ordering O) const { + switch (O) { + case NVPTX::Ordering::NotAtomic: + case NVPTX::Ordering::Volatile: // Non-atomic volatile operations + // NVPTX uses Thread scope as the scope of non-atomic operations. + return NVPTX::Scope::Thread; + case NVPTX::Ordering::RelaxedMMIO: + // RelaxedMMIO operations are always system scope. + // If a RelaxedMMIO order was generated from an atomic volatile operation + // with a smaller thread scope, we bump it here to system scope. + return NVPTX::Scope::System; + case NVPTX::Ordering::Relaxed: + case NVPTX::Ordering::Acquire: + case NVPTX::Ordering::Release: + case NVPTX::Ordering::AcquireRelease: + case NVPTX::Ordering::SequentiallyConsistent: + auto S = Scopes[N->getSyncScopeID()]; + + // Atomic operations must have a scope greater than thread. + if (S == NVPTX::Scope::Thread) + report_fatal_error( + formatv("Atomics need scope > \"{}\".", ScopeToString(S))); + + // If scope is cluster, clusters must be supported. 
+ if (S == NVPTX::Scope::Cluster) + Subtarget->failIfClustersUnsupported("cluster scope"); + + // If operation is volatile, then its scope is system. + return N->isVolatile() ? NVPTX::Scope::System : S; + } + llvm_unreachable("unhandled ordering"); +} + static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address @@ -934,7 +979,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for // explicitly invariant loads because these are how clang tells us to use ldg // when the user uses a builtin. - if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) + if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global) return false; if (N->isInvariant()) @@ -957,33 +1002,87 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, }); } -NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, - SDValue &Chain, - MemSDNode *N) { - // Some memory instructions - loads, stores, atomics - need an extra fence - // instruction. Get the memory order of the instruction, and that of its - // fence, if any. +static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, + NVPTXSubtarget const *T) { + if (S == NVPTX::Scope::Cluster) + T->failIfClustersUnsupported(".cluster scope fence"); + + switch (O) { + case NVPTX::Ordering::Acquire: + case NVPTX::Ordering::Release: + case NVPTX::Ordering::AcquireRelease: { + switch (S) { + case NVPTX::Scope::System: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys + : NVPTX::INT_MEMBAR_SYS; + case NVPTX::Scope::Block: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta + : NVPTX::INT_MEMBAR_CTA; + case NVPTX::Scope::Cluster: + return NVPTX::atomic_thread_fence_acq_rel_cluster; + case NVPTX::Scope::Device: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu + : NVPTX::INT_MEMBAR_GL; + case NVPTX::Scope::Thread: + report_fatal_error( + formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", + ScopeToString(S))); + } + } + case NVPTX::Ordering::SequentiallyConsistent: { + switch (S) { + case NVPTX::Scope::System: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::INT_MEMBAR_SYS; + case NVPTX::Scope::Block: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta + : NVPTX::INT_MEMBAR_CTA; + case NVPTX::Scope::Cluster: + return NVPTX::atomic_thread_fence_seq_cst_cluster; + case NVPTX::Scope::Device: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu + : NVPTX::INT_MEMBAR_GL; + case NVPTX::Scope::Thread: + report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.", + ScopeToString(S))); + } + } + case NVPTX::Ordering::NotAtomic: + case NVPTX::Ordering::Relaxed: + case NVPTX::Ordering::Volatile: + case NVPTX::Ordering::RelaxedMMIO: + report_fatal_error( + formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.", + OrderingToString(O), ScopeToString(S))); + } + llvm_unreachable("unhandled ordering"); +} + +// Returns Memory Order and Scope of a memory instruction, and +// inserts any fence before the instruction that's required to +// implement its memory ordering. 
+std::pair +NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, + MemSDNode *N) { auto [InstructionOrdering, FenceOrdering] = getOperationOrderings(N, Subtarget); + auto Scope = getOperationScope(N, InstructionOrdering); // If a fence is required before the operation, insert it: switch (NVPTX::Ordering(FenceOrdering)) { case NVPTX::Ordering::NotAtomic: break; case NVPTX::Ordering::SequentiallyConsistent: { - unsigned Op = Subtarget->hasMemoryOrdering() - ? NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::INT_MEMBAR_SYS; + auto Op = getFenceOp(FenceOrdering, Scope, Subtarget); Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); break; } default: report_fatal_error( formatv("Unexpected fence ordering: \"{}\".", - OrderingToCString(NVPTX::Ordering(FenceOrdering)))); + OrderingToString(NVPTX::Ordering(FenceOrdering)))); } - - return InstructionOrdering; + return {InstructionOrdering, Scope}; } bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { @@ -1154,7 +1253,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); - auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD); + auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD); // Type Setting: fromType + fromTypeWidth // @@ -1189,7 +1288,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { std::optional Opcode; MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; - SmallVector Ops({getI32Imm(InstructionOrdering, DL), + SmallVector Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL)}); @@ -1266,7 +1365,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); - auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); + auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1319,7 +1418,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { std::optional Opcode; SDNode *LD; - SmallVector Ops({getI32Imm(InstructionOrdering, DL), + SmallVector Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL)}); @@ -1895,7 +1994,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { SDLoc DL(N); SDValue Chain = ST->getChain(); - auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST); + auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); @@ -1923,10 +2022,10 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT::SimpleValueType SourceVT = Value.getNode()->getSimpleValueType(0).SimpleTy; - SmallVector Ops({Value, getI32Imm(InstructionOrdering, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(ToType, DL), - getI32Imm(ToTypeWidth, DL)}); + SmallVector Ops( + {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL), + getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), + getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)}); if (SelectDirectAddr(BasePtr, Addr)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, @@ -1996,7 +2095,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { // Address Space Setting unsigned CodeAddrSpace = getCodeAddrSpace(MemSD); - if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { + if (CodeAddrSpace == 
NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); } @@ -2005,7 +2104,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); - auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); + auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -2044,9 +2143,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(ToType, DL), - getI32Imm(ToTypeWidth, DL)}); + Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), + getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), + getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)}); if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { @@ -4064,3 +4163,41 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, } } } + +bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { + SDLoc DL(N); + assert(N->getOpcode() == ISD::ATOMIC_FENCE); + unsigned int FenceOp = + getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)), + Scopes[N->getConstantOperandVal(2)], Subtarget); + SDValue Chain = N->getOperand(0); + SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain); + ReplaceNode(N, FenceNode); + return true; +} + +NVPTXScopes::NVPTXScopes(LLVMContext &C) { + Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; + Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; + Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; + Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster; + Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device; +} + +NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { + if (Scopes.empty()) + llvm_unreachable("NVPTX Scopes must be initialized before calling " + "NVPTXScopes::operator[]"); + + auto S = Scopes.find(ID); + if (S == Scopes.end()) { + // TODO: + // - Add API to LLVMContext to get the name of a single scope. + // - Use that API here to print an error containing the name + // of this Unknown ID. 
+ report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + } + return S->second; +} + +bool NVPTXScopes::empty() const { return Scopes.size() == 0; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index eac4056599511c..c128c082c29837 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -18,13 +18,25 @@ #include "NVPTXISelLowering.h" #include "NVPTXRegisterInfo.h" #include "NVPTXTargetMachine.h" +#include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Compiler.h" namespace llvm { +struct NVPTXScopes { + NVPTXScopes() = default; + NVPTXScopes(LLVMContext &C); + NVPTX::Scope operator[](SyncScope::ID ID) const; + bool empty() const; + +private: + SmallMapVector Scopes{}; +}; + class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { const NVPTXTargetMachine &TM; @@ -38,6 +50,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool allowUnsafeFPMath() const; bool doRsqrtOpt() const; + NVPTXScopes Scopes{}; + public: NVPTXDAGToDAGISel() = delete; @@ -66,6 +80,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryLoadParam(SDNode *N); bool tryStoreRetval(SDNode *N); bool tryStoreParam(SDNode *N); + bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryTextureIntrinsic(SDNode *N); bool trySurfaceIntrinsic(SDNode *N); @@ -100,8 +115,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N); - NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, - MemSDNode *N); + // Returns the Memory Order and Scope that the PTX memory instruction should + // use, and inserts the appropriate fence instruction before the memory + // instruction, if one is needed to implement the instruction's memory order. + // Required fences after the instruction need to be handled elsewhere.
+ std::pair + insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N); + NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const; }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b7e210805db904..510e4b81003119 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2971,39 +2971,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { multiclass LD { def _avar : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _ari : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _ari_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _asi : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; } @@ -3019,39 +3019,42 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST { def _avar : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def 
_areg : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _ari : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, + i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _ari_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, + i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _asi : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, + i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; } @@ -3070,75 +3073,75 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - 
"ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v4_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, 
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { @@ -3153,84 +3156,87 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST_VEC { def _v2_avar : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + imem:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int32Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int64Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_ari : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, - i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, 
LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int32Regs:$addr, i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_ari_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, - i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int64Regs:$addr, i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_asi : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, - i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + imem:$addr, i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v4_avar : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, 
regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}" "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; } @@ -4003,17 +4009,23 @@ def atomic_thread_fence_acq_rel_sys : NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>, Requires<[hasPTX<60>, hasSM<70>]>; -def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1) - Requires<[hasPTX<60>, hasSM<70>]>; -def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1) - Requires<[hasPTX<60>, hasSM<70>]>; -def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acq_rel(6) sys(1) - Requires<[hasPTX<60>, hasSM<70>]>; -def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1) - Requires<[hasPTX<60>, hasSM<70>]>; - -// If PTX<60 or SM<70, we fall back to MEMBAR: -def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1) -def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1) -def : Pat<(atomic_fence (i64 6), (i64 1)), (INT_MEMBAR_SYS)>; // acq_rel(6) sys(1) -def : Pat<(atomic_fence (i64 7), (i64 1)), (INT_MEMBAR_SYS)>; // seq_cst(7) sys(1) +def atomic_thread_fence_seq_cst_gpu : + NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; +def atomic_thread_fence_acq_rel_gpu : + NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; + +def atomic_thread_fence_seq_cst_cluster : + NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>, + Requires<[hasPTX<78>, hasSM<90>]>; +def atomic_thread_fence_acq_rel_cluster : + NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>, + Requires<[hasPTX<78>, hasSM<90>]>; + +def atomic_thread_fence_seq_cst_cta : + NVPTXInst<(outs), (ins), "fence.sc.cta;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; +def atomic_thread_fence_acq_rel_cta : + NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; \ No newline at end of file diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 656fc679a572aa..56c551661151d7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1577,20 +1577,6 @@ def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a), def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a), (CVT_f16x2_e5m2x2 Int16Regs:$a, CvtRN_RELU)>; -// -// Bitcast -// - -def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs, - Float32Regs, int_nvvm_bitcast_f2i>; -def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", 
Float32Regs, - Int32Regs, int_nvvm_bitcast_i2f>; - -def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs, - Int64Regs, int_nvvm_bitcast_ll2d>; -def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs, - Float64Regs, int_nvvm_bitcast_d2ll>; - // // FNS // diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index f2515f971595bf..f66504b09cb63f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -11,12 +11,11 @@ // to work reliably, inlining of all function call must be performed. // //===----------------------------------------------------------------------===// - +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "NVPTXMachineFunctionInfo.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -1820,8 +1819,8 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) { return false; } - assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!"); - StringRef Sym = TexHandleDef.getOperand(6).getSymbolName(); + assert(TexHandleDef.getOperand(7).isSymbol() && "Load is not a symbol!"); + StringRef Sym = TexHandleDef.getOperand(7).getSymbolName(); std::string ParamBaseName = std::string(MF.getName()); ParamBaseName += "_param_"; assert(Sym.starts_with(ParamBaseName) && "Invalid symbol reference"); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 420065585b3849..0e6b75e622c6ad 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -12,6 +12,8 @@ #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" using namespace llvm; @@ -69,3 +71,14 @@ bool NVPTXSubtarget::hasImageHandles() const { bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } + +void NVPTXSubtarget::failIfClustersUnsupported( + std::string const &FailureMessage) const { + if (hasClusters()) + return; + + report_fatal_error(formatv( + "NVPTX SM architecture \"{}\" and PTX version \"{}\" do not support {}. 
" + "Requires SM >= 90 and PTX >= 78.", + getFullSmVersion(), PTXVersion, FailureMessage)); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 457f10f1d64a26..8b9059bd60cbd4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -78,6 +78,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasAtomBitwise64() const { return SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } + bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } bool hasLDG() const { return SmVersion >= 32; } bool hasHWROT32() const { return SmVersion >= 32; } bool hasImageHandles() const; @@ -119,6 +120,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + + void failIfClustersUnsupported(std::string const &FailureMessage) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index eebd91fefe4f03..938b9b04b7a449 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -20,6 +20,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/FormatVariadic.h" #include #include #include @@ -86,7 +87,7 @@ bool Isv2x16VT(EVT VT); namespace NVPTX { -inline std::string OrderingToCString(Ordering Order) { +inline std::string OrderingToString(Ordering Order) { switch (Order) { case Ordering::NotAtomic: return "NotAtomic"; @@ -96,7 +97,8 @@ inline std::string OrderingToCString(Ordering Order) { return "Acquire"; case Ordering::Release: return "Release"; - // case Ordering::AcquireRelease: return "AcquireRelease"; + case Ordering::AcquireRelease: + return "AcquireRelease"; case Ordering::SequentiallyConsistent: return "SequentiallyConsistent"; case Ordering::Volatile: @@ -104,11 +106,58 @@ inline std::string OrderingToCString(Ordering Order) { case Ordering::RelaxedMMIO: return "RelaxedMMIO"; } - report_fatal_error("unknown ordering"); + report_fatal_error(formatv("Unknown NVPTX::Ordering \"{}\".", + static_cast(Order))); } inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { - O << OrderingToCString(Order); + O << OrderingToString(Order); + return O; +} + +inline std::string ScopeToString(Scope S) { + switch (S) { + case Scope::Thread: + return "Thread"; + case Scope::System: + return "System"; + case Scope::Block: + return "Block"; + case Scope::Cluster: + return "Cluster"; + case Scope::Device: + return "Device"; + } + report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".", + static_cast(S))); +} + +inline raw_ostream &operator<<(raw_ostream &O, Scope S) { + O << ScopeToString(S); + return O; +} + +inline std::string AddressSpaceToString(AddressSpace A) { + switch (A) { + case AddressSpace::Generic: + return "generic"; + case AddressSpace::Global: + return "global"; + case AddressSpace::Const: + return "const"; + case AddressSpace::Shared: + return "shared"; + case AddressSpace::Param: + return "param"; + case AddressSpace::Local: + return "local"; + } + report_fatal_error(formatv("Unknown NVPTX::AddressSpace \"{}\".", + static_cast(A))); +} + +inline raw_ostream &operator<<(raw_ostream &O, AddressSpace A) { + O << AddressSpaceToString(A); return O; } diff 
--git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index bdc4d4dd623da5..768df71715fa63 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -635,8 +635,17 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( InstructionCost Cost = BaseT::getScalarizationOverhead( Ty, DemandedElts, Insert, Extract, CostKind); std::pair LT = getTypeLegalizationCost(Ty); - if (Insert && !Extract && LT.first.isValid() && LT.second.isVector() && - Ty->getScalarSizeInBits() != 1) { + if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) { + if (Ty->getScalarSizeInBits() == 1) { + auto *WideVecTy = cast(Ty->getWithNewBitWidth(8)); + // Note: Implicit scalar anyextend is assumed to be free since the i1 + // must be stored in a GPR. + return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract, + CostKind) + + getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, + TTI::CastContextHint::None, CostKind, nullptr); + } + assert(LT.second.isFixedLengthVector()); MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second); if (isM1OrSmaller(ContainerVT)) { diff --git a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp index 322e051a87db1a..246eecd4ffcaa1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp @@ -127,7 +127,8 @@ void SPIRVRegularizer::runLowerConstExpr(Function &F) { ReplList.push_back(Inst); Repl = InsertElementInst::Create( (Repl ? Repl : PoisonValue::get(Vec->getType())), V, - ConstantInt::get(Type::getInt32Ty(Ctx), Idx++), "", InsPoint); + ConstantInt::get(Type::getInt32Ty(Ctx), Idx++), "", + InsPoint->getIterator()); } WorkList.splice(WorkList.begin(), ReplList); return Repl; @@ -234,11 +235,12 @@ void SPIRVRegularizer::visitCallScalToVec(CallInst *CI, StringRef MangledName, // %call = OpExtInst %v2uint %1 s_min %14 %11 auto ConstInt = ConstantInt::get(IntegerType::get(CI->getContext(), 32), 0); PoisonValue *PVal = PoisonValue::get(Arg0Ty); - Instruction *Inst = - InsertElementInst::Create(PVal, CI->getOperand(1), ConstInt, "", CI); + Instruction *Inst = InsertElementInst::Create( + PVal, CI->getOperand(1), ConstInt, "", CI->getIterator()); ElementCount VecElemCount = cast(Arg0Ty)->getElementCount(); Constant *ConstVec = ConstantVector::getSplat(VecElemCount, ConstInt); - Value *NewVec = new ShuffleVectorInst(Inst, PVal, ConstVec, "", CI); + Value *NewVec = + new ShuffleVectorInst(Inst, PVal, ConstVec, "", CI->getIterator()); CI->setOperand(1, NewVec); CI->replaceUsesOfWith(OldF, NewF); CI->mutateFunctionType(NewF->getFunctionType()); diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index b632d784977678..c87048b93f80fc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -62,7 +62,7 @@ class SPIRVStripConvergentIntrinsics : public FunctionPass { return; auto *NewCall = CallBase::removeOperandBundle( - CI, LLVMContext::OB_convergencectrl, CI); + CI, LLVMContext::OB_convergencectrl, CI->getIterator()); NewCall->copyMetadata(*CI); CI->replaceAllUsesWith(NewCall); ToRemove.insert(CI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f0a23c991c7ce8..b2e5d727555327 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ 
b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -336,9 +336,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } + setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); } } if (Subtarget.hasAVX10_2()) { @@ -358,6 +360,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. @@ -721,6 +726,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); @@ -937,6 +943,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); } setOperationAction(ISD::SETCC, MVT::f128, Custom); @@ -1070,9 +1077,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); @@ -1133,6 +1142,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); @@ -1465,6 +1475,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v8f32, Custom); @@ -1730,6 +1741,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, 
Custom); // There is no byte sized k-register load or store without AVX512DQ. if (!Subtarget.hasDQI()) { @@ -1809,6 +1823,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); @@ -32689,6 +32704,24 @@ static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, return Op; } +static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + SDValue Operand = N->getOperand(0); + EVT VT = Operand.getValueType(); + SDLoc dl(N); + + SDValue One = DAG.getConstantFP(1.0, dl, VT); + + // TODO: Fix crash for bf16 when generating strict_fmul, as it + // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0, + // ConstantFP:bf16, t5 LLVM ERROR: Do not know how to soft + // promote this operator's result! + SDValue Chain = DAG.getEntryNode(); + SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, + {Chain, Operand, One}); + return StrictFmul; +} + static StringRef getInstrStrFromOpNo(const SmallVectorImpl &AsmStrs, unsigned OpNo) { const APInt Operand(32, OpNo); @@ -32828,6 +32861,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG); case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 09ffc2d184f18b..01642b0677aba3 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1052,6 +1052,13 @@ void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped) { auto &Ctx = CI->getContext(); IRBuilder<> B(Ctx); + // We want these instructions to be recognized as inlined instructions for the + // compare call, but we don't have a source location for the definition of + // that function, since we're generating that code now. Because the generated + // code is a viable point for a memory access error, we make the pragmatic + // choice here to directly use CI's location so that we have useful + // attribution for the generated code. + B.SetCurrentDebugLocation(CI->getDebugLoc()); BasicBlock *BBCI = CI->getParent(); BasicBlock *BBTail = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 9c623c1250b302..6a3d07dbc00980 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3536,9 +3536,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { Pred = ICmpInst::getSwappedPredicate(Pred); std::swap(LHS, RHS); } - - Intrinsic::ID IID = - ICmpInst::isSigned(Pred) ?
Intrinsic::scmp : Intrinsic::ucmp; + bool IsSigned = ICmpInst::isSigned(Pred); bool Replace = false; ICmpInst::Predicate ExtendedCmpPredicate; @@ -3560,6 +3558,32 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { ICmpInst::getSwappedPredicate(ExtendedCmpPredicate) == Pred)) Replace = true; + // (x == y) ? 0 : (x > y ? 1 : -1) + ICmpInst::Predicate FalseBranchSelectPredicate; + const APInt *InnerTV, *InnerFV; + if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero()) && + match(FV, m_Select(m_c_ICmp(FalseBranchSelectPredicate, m_Specific(LHS), + m_Specific(RHS)), + m_APInt(InnerTV), m_APInt(InnerFV)))) { + if (!ICmpInst::isGT(FalseBranchSelectPredicate)) { + FalseBranchSelectPredicate = + ICmpInst::getSwappedPredicate(FalseBranchSelectPredicate); + std::swap(LHS, RHS); + } + + if (!InnerTV->isOne()) { + std::swap(InnerTV, InnerFV); + std::swap(LHS, RHS); + } + + if (ICmpInst::isGT(FalseBranchSelectPredicate) && InnerTV->isOne() && + InnerFV->isAllOnes()) { + IsSigned = ICmpInst::isSigned(FalseBranchSelectPredicate); + Replace = true; + } + } + + Intrinsic::ID IID = IsSigned ? Intrinsic::scmp : Intrinsic::ucmp; if (Replace) return replaceInstUsesWith( SI, Builder.CreateIntrinsic(SI.getType(), IID, {LHS, RHS})); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index f6a0f5880cd5c7..5740285675eba8 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1826,11 +1826,8 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { // If the only use of phi is comparing it with a constant then we can // put this comparison in the incoming BB directly after a ucmp/scmp call // because we know that it will simplify to a single icmp. - // NOTE: the single-use check here is not only to ensure that the - // optimization is profitable, but also to avoid creating a potentially - // invalid phi node when we have a multi-edge in the CFG. const APInt *Ignored; - if (isa(InVal) && InVal->hasOneUse() && + if (isa(InVal) && InVal->hasOneUser() && match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) { OpsToMoveUseToIncomingBB.push_back(i); NewPhiValues.push_back(nullptr); @@ -1868,18 +1865,24 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { // Clone the instruction that uses the phi node and move it into the incoming // BB because we know that the next iteration of InstCombine will simplify it. 
+ SmallDenseMap Clones; for (auto OpIndex : OpsToMoveUseToIncomingBB) { Value *Op = PN->getIncomingValue(OpIndex); BasicBlock *OpBB = PN->getIncomingBlock(OpIndex); - Instruction *Clone = I.clone(); - for (Use &U : Clone->operands()) { - if (U == PN) - U = Op; - else - U = U->DoPHITranslation(PN->getParent(), OpBB); + Instruction *Clone = Clones.lookup(OpBB); + if (!Clone) { + Clone = I.clone(); + for (Use &U : Clone->operands()) { + if (U == PN) + U = Op; + else + U = U->DoPHITranslation(PN->getParent(), OpBB); + } + Clone = InsertNewInstBefore(Clone, OpBB->getTerminator()->getIterator()); + Clones.insert({OpBB, Clone}); } - Clone = InsertNewInstBefore(Clone, OpBB->getTerminator()->getIterator()); + NewPhiValues[OpIndex] = Clone; } diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 3c3cc2599aee2f..577647cac3f58c 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1077,17 +1077,16 @@ void DFSanFunction::addReachesFunctionCallbacksIfEnabled(IRBuilder<> &IRB, if (dbgloc.get() == nullptr) { CILine = llvm::ConstantInt::get(I.getContext(), llvm::APInt(32, 0)); - FilePathPtr = IRB.CreateGlobalStringPtr( + FilePathPtr = IRB.CreateGlobalString( I.getFunction()->getParent()->getSourceFileName()); } else { CILine = llvm::ConstantInt::get(I.getContext(), llvm::APInt(32, dbgloc.getLine())); - FilePathPtr = - IRB.CreateGlobalStringPtr(dbgloc->getFilename()); + FilePathPtr = IRB.CreateGlobalString(dbgloc->getFilename()); } llvm::Value *FunctionNamePtr = - IRB.CreateGlobalStringPtr(I.getFunction()->getName()); + IRB.CreateGlobalString(I.getFunction()->getName()); CallInst *CB; std::vector args; @@ -1293,7 +1292,7 @@ void DataFlowSanitizer::buildExternWeakCheckIfNeeded(IRBuilder<> &IRB, if (GlobalValue::isExternalWeakLinkage(F->getLinkage())) { std::vector Args; Args.push_back(F); - Args.push_back(IRB.CreateGlobalStringPtr(F->getName())); + Args.push_back(IRB.CreateGlobalString(F->getName())); IRB.CreateCall(DFSanWrapperExternWeakNullFn, Args); } } @@ -1313,8 +1312,7 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, if (F->isVarArg()) { NewF->removeFnAttr("split-stack"); CallInst::Create(DFSanVarargWrapperFn, - IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "", - BB); + IRBuilder<>(BB).CreateGlobalString(F->getName()), "", BB); new UnreachableInst(*Ctx, BB); } else { auto ArgIt = pointer_iterator(NewF->arg_begin()); @@ -3086,7 +3084,7 @@ bool DFSanVisitor::visitWrappedCallBase(Function &F, CallBase &CB) { case DataFlowSanitizer::WK_Warning: CB.setCalledFunction(&F); IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn, - IRB.CreateGlobalStringPtr(F.getName())); + IRB.CreateGlobalString(F.getName())); DFSF.DFS.buildExternWeakCheckIfNeeded(IRB, &F); DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB)); DFSF.setOrigin(&CB, DFSF.DFS.ZeroOrigin); diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 694b2e6af718b7..a409f6150a71c1 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -1126,7 +1126,7 @@ Function *GCOVProfiler::insertCounterWriteout( uint32_t CfgChecksum = FileChecksums.empty() ? 
0 : FileChecksums[i]; auto *StartFileCallArgs = ConstantStruct::get( StartFileCallArgsTy, - {Builder.CreateGlobalStringPtr(FilenameGcda), + {Builder.CreateGlobalString(FilenameGcda), Builder.getInt32(endian::read32be(Options.Version)), Builder.getInt32(CfgChecksum)}); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index aa31e6c0c444ac..1f2c9389c008bd 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6528,9 +6528,10 @@ SwitchLookupTable::SwitchLookupTable( if (LinearMappingPossible) { LinearOffset = cast(TableContents[0]); LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev); - bool MayWrap = false; APInt M = LinearMultiplier->getValue(); - (void)M.smul_ov(APInt(M.getBitWidth(), TableSize - 1), MayWrap); + bool MayWrap = true; + if (isIntN(M.getBitWidth(), TableSize - 1)) + (void)M.smul_ov(APInt(M.getBitWidth(), TableSize - 1), MayWrap); LinearMapValWrapped = NonMonotonic || MayWrap; Kind = LinearMapKind; ++NumLinearMaps; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index b767372a56b914..e695902c9d72ad 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1473,13 +1473,13 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { // Keep a record of all the exiting blocks. SmallVector Predicates; - for (BasicBlock *BB1 : ExitingBlocks) { + for (BasicBlock *BB : ExitingBlocks) { const SCEV *EC = - PSE.getSE()->getPredicatedExitCount(TheLoop, BB1, &Predicates); + PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates); if (isa(EC)) { - UncountableExitingBlocks.push_back(BB1); + UncountableExitingBlocks.push_back(BB); - SmallVector Succs(successors(BB1)); + SmallVector Succs(successors(BB)); if (Succs.size() != 2) { reportVectorizationFailure( "Early exiting block does not have exactly two successors", @@ -1488,17 +1488,21 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { return false; } - BasicBlock *BB2; + BasicBlock *ExitBlock; if (!TheLoop->contains(Succs[0])) - BB2 = Succs[0]; + ExitBlock = Succs[0]; else { assert(!TheLoop->contains(Succs[1])); - BB2 = Succs[1]; + ExitBlock = Succs[1]; } - UncountableExitBlocks.push_back(BB2); + UncountableExitBlocks.push_back(ExitBlock); } else - CountableExitingBlocks.push_back(BB1); + CountableExitingBlocks.push_back(BB); } + // We can safely ignore the predicates here because when vectorizing the loop + // the PredicatedScalarEvolution class will keep track of all predicates + // for each exiting block anyway. This happens when calling + // PSE.getSymbolicMaxBackedgeTakenCount() below. Predicates.clear(); // We only support one uncountable early exit. @@ -1513,13 +1517,25 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { // The only supported early exit loops so far are ones where the early // exiting block is a unique predecessor of the latch block. BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor(); - if (LatchPredBB != getSpeculativeEarlyExitingBlock()) { + if (LatchPredBB != getUncountableEarlyExitingBlock()) { reportVectorizationFailure("Early exit is not the latch predecessor", "Cannot vectorize early exit loop", "EarlyExitNotLatchPredecessor", ORE, TheLoop); return false; } + // The latch block must have a countable exit.
+ if (isa( + PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) { + reportVectorizationFailure( + "Cannot determine exact exit count for latch block", + "Cannot vectorize early exit loop", + "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop); + return false; + } + assert(llvm::is_contained(CountableExitingBlocks, LatchBB) && + "Latch block not found in list of countable exits!"); + // Check to see if there are instructions that could potentially generate // exceptions or have side-effects. auto IsSafeOperation = [](Instruction *I) -> bool { @@ -1555,18 +1571,8 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { } } - // The latch block must have a countable exit. - if (isa( - PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) { - reportVectorizationFailure( - "Cannot determine exact exit count for latch block", - "Cannot vectorize early exit loop", - "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop); - return false; - } - // The vectoriser cannot handle loads that occur after the early exit block. - assert(LatchBB->getUniquePredecessor() == getSpeculativeEarlyExitingBlock() && + assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() && "Expected latch predecessor to be the early exiting block"); // TODO: Handle loops that may fault. @@ -1580,16 +1586,15 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { return false; } - LLVM_DEBUG( - dbgs() - << "LV: Found an early exit. Retrying with speculative exit count.\n"); - [[maybe_unused]] const SCEV *SpecExitCount = + [[maybe_unused]] const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); - assert(!isa(SpecExitCount) && + // Since we have an exact exit count for the latch and the early exit + // dominates the latch, then this should guarantee a computed SCEV value. + assert(!isa(SymbolicMaxBTC) && "Failed to get symbolic expression for backedge taken count"); - - LLVM_DEBUG(dbgs() << "LV: Found speculative backedge taken count: " - << *SpecExitCount << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max " + "backedge taken count: " + << *SymbolicMaxBTC << '\n'); return true; } @@ -1653,7 +1658,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return false; } - HasSpeculativeEarlyExit = false; + HasUncountableEarlyExit = false; if (isa(PSE.getBackedgeTakenCount())) { if (!isVectorizableEarlyExitLoop()) { if (DoExtraAnalysis) @@ -1661,7 +1666,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { else return false; } else - HasSpeculativeEarlyExit = true; + HasUncountableEarlyExit = true; } // Go over each instruction and look at memory deps. 
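The legality change above rests on the distinction between a countable exit (the latch, whose exit count ScalarEvolution can compute exactly) and a single uncountable early exit whose exit count is SCEVCouldNotCompute. As a rough illustration only (this sketch is not part of the patch, and the function and variable names are invented for the example), the loop shape that isVectorizableEarlyExitLoop() now recognizes looks like this in C++:

// Minimal sketch, assuming a simple search loop; illustration only.
// The latch exit (i < n) is countable: its exact trip count is known to
// ScalarEvolution. The early exit (a[i] == key) depends on loaded data,
// so its exit count is SCEVCouldNotCompute -- an uncountable early exit.
// The early exiting block is also the unique predecessor of the latch,
// matching the constraint checked above.
int find_first(const int *a, int n, int key) {
  for (int i = 0; i < n; ++i) {
    if (a[i] == key) // uncountable early exit
      return i;      // dedicated exit block outside the loop
  }
  return -1;         // reached via the countable latch exit
}

Such loops are still rejected by the vectorizer itself; the LoopVectorize.cpp hunk below only records them via HasUncountableEarlyExit and emits the clearer "uncountable early exit" failure message.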
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0dc61cd500e327..0566d80c1cc001 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5207,7 +5207,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { const auto &TTICapture = TTI; auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { - if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || + (VF.isScalable() && + !TTICapture.isElementTypeLegalForScalableVector(Ty))) return 0; return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; @@ -9792,11 +9794,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasSpeculativeEarlyExit()) { - reportVectorizationFailure( - "Auto-vectorization of early exit loops is not yet supported.", - "Auto-vectorization of early exit loops is not yet supported.", - "EarlyExitLoopsUnsupported", ORE, L); + if (LVL.hasUncountableEarlyExit()) { + reportVectorizationFailure("Auto-vectorization of loops with uncountable " + "early exit is not yet supported", + "Auto-vectorization of loops with uncountable " + "early exit is not yet supported", + "UncountableEarlyExitLoopsUnsupported", ORE, L); return false; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a88702b81096ed..04b8fc09a724dd 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -18668,6 +18668,14 @@ class HorizontalReduction { // Vectorize a tree. Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues, InsertPt); + // Update TrackedToOrig mapping, since the tracked values might be + // updated. 
+ for (Value *RdxVal : Candidates) { + Value *OrigVal = TrackedToOrig.at(RdxVal); + Value *TransformedRdxVal = TrackedVals.at(OrigVal); + if (TransformedRdxVal != RdxVal) + TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal); + } Builder.SetInsertPoint(InsertPt); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 3fcfa4c9840e7c..ce15b2783cc457 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -233,13 +233,12 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { return Def->getLiveInIRValue(); if (hasScalarValue(Def, Instance)) { - return Data - .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)]; + return Data.VPV2Scalars[Def][Instance.Lane.mapToCacheIndex(VF)]; } if (!Instance.Lane.isFirstLane() && vputils::isUniformAfterVectorization(Def) && hasScalarValue(Def, {Instance.Part, VPLane::getFirstLane()})) { - return Data.PerPartScalars[Def][Instance.Part][0]; + return Data.VPV2Scalars[Def][0]; } assert(hasVectorValue(Def)); @@ -260,7 +259,7 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def) || !vputils::onlyFirstLaneUsed(Def) || (hasScalarValue(Def, VPIteration(0, 0)) && - Data.PerPartScalars[Def][0].size() == 1)) && + Data.VPV2Scalars[Def].size() == 1)) && "Trying to access a single scalar per part but has multiple scalars " "per part."); return get(Def, VPIteration(0, 0)); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e0a5b97540d400..0632495bc511cd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -267,8 +267,7 @@ struct VPTransformState { // vector value in the map. DenseMap VPV2Vector; - using ScalarsPerPartValuesTy = SmallVector, 2>; - DenseMap PerPartScalars; + DenseMap> VPV2Scalars; } Data; /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar @@ -281,13 +280,11 @@ struct VPTransformState { bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); } bool hasScalarValue(VPValue *Def, VPIteration Instance) { - auto I = Data.PerPartScalars.find(Def); - if (I == Data.PerPartScalars.end()) + auto I = Data.VPV2Scalars.find(Def); + if (I == Data.VPV2Scalars.end()) return false; unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); - return Instance.Part < I->second.size() && - CacheIdx < I->second[Instance.Part].size() && - I->second[Instance.Part][CacheIdx]; + return CacheIdx < I->second.size() && I->second[CacheIdx]; } /// Set the generated vector Value for a given VPValue, if \p @@ -310,11 +307,8 @@ struct VPTransformState { /// Set the generated scalar \p V for \p Def and the given \p Instance. void set(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.insert({Def, {}}); - auto &PerPartVec = Iter.first->second; - if (PerPartVec.size() <= Instance.Part) - PerPartVec.resize(Instance.Part + 1); - auto &Scalars = PerPartVec[Instance.Part]; + auto Iter = Data.VPV2Scalars.insert({Def, {}}); + auto &Scalars = Iter.first->second; unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); if (Scalars.size() <= CacheIdx) Scalars.resize(CacheIdx + 1); @@ -324,15 +318,13 @@ struct VPTransformState { /// Reset an existing scalar value for \p Def and a given \p Instance. 
void reset(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.find(Def); - assert(Iter != Data.PerPartScalars.end() && - "need to overwrite existing value"); - assert(Instance.Part < Iter->second.size() && + auto Iter = Data.VPV2Scalars.find(Def); + assert(Iter != Data.VPV2Scalars.end() && "need to overwrite existing value"); unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); - assert(CacheIdx < Iter->second[Instance.Part].size() && + assert(CacheIdx < Iter->second.size() && "need to overwrite existing value"); - Iter->second[Instance.Part][CacheIdx] = V; + Iter->second[CacheIdx] = V; } /// Add additional metadata to \p To that was not present on \p Orig. diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll index ff2609b5cc4e0a..b96fdb0109829b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll @@ -1,13 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN ; Check that we don't crash querying costs when vectors are not enabled. ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 define void @fadd() { ; CHECK-LABEL: 'fadd' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fadd <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fadd <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fadd <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fadd <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fadd <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fadd <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 
x float> undef, undef @@ -28,9 +39,22 @@ define void @fadd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = fadd bfloat undef, undef %F32 = fadd float undef, undef %F64 = fadd double undef, undef + %V1BF16 = fadd <1 x bfloat> undef, undef + %V2BF16 = fadd <2 x bfloat> undef, undef + %V4BF16 = fadd <4 x bfloat> undef, undef + %V8BF16 = fadd <8 x bfloat> undef, undef + %V16BF16 = fadd <16 x bfloat> undef, undef + + %NXV1BF16 = fadd undef, undef + %NXV2BF16 = fadd undef, undef + %NXV4BF16 = fadd undef, undef + %NXV8BF16 = fadd undef, undef + %NXV16BF16 = fadd undef, undef + %V1F32 = fadd <1 x float> undef, undef %V2F32 = fadd <2 x float> undef, undef %V4F32 = fadd <4 x float> undef, undef @@ -94,8 +118,19 @@ define void @fadd_f16() { define void @fsub() { ; CHECK-LABEL: 'fsub' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fsub <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fsub <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fsub <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fsub <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fsub <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fsub <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> undef, undef @@ -116,9 +151,22 @@ define void @fsub() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %F16 = fsub half undef, undef %F32 = fsub float undef, undef %F64 = fsub double undef, undef + %V1BF16 = fsub <1 x bfloat> undef, undef + %V2BF16 = fsub <2 x bfloat> undef, undef + %V4BF16 = fsub <4 x bfloat> undef, undef + %V8BF16 = fsub <8 x bfloat> undef, undef + %V16BF16 = fsub <16 x bfloat> undef, undef + + %NXV1BF16 = fsub undef, undef + %NXV2BF16 = fsub undef, undef + %NXV4BF16 = fsub undef, undef + %NXV8BF16 = fsub undef, undef + %NXV16BF16 = fsub undef, undef + %V1F32 = fsub <1 x float> undef, undef %V2F32 = fsub <2 x float> undef, undef %V4F32 = fsub <4 x float> undef, undef @@ -182,8 +230,19 @@ define void @fsub_f16() { define void @fmul() 
{ ; CHECK-LABEL: 'fmul' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fmul <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fmul <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fmul <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fmul <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fmul <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fmul <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul <4 x float> undef, undef @@ -204,9 +263,22 @@ define void @fmul() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = fmul bfloat undef, undef %F32 = fmul float undef, undef %F64 = fmul double undef, undef + %V1BF16 = fmul <1 x bfloat> undef, undef + %V2BF16 = fmul <2 x bfloat> undef, undef + %V4BF16 = fmul <4 x bfloat> undef, undef + %V8BF16 = fmul <8 x bfloat> undef, undef + %V16BF16 = fmul <16 x bfloat> undef, undef + + %NXV1BF16 = fmul undef, undef + %NXV2BF16 = fmul undef, undef + %NXV4BF16 = fmul undef, undef + %NXV8BF16 = fmul undef, undef + %NXV16BF16 = fmul undef, undef + %V1F32 = fmul <1 x float> undef, undef %V2F32 = fmul <2 x float> undef, undef %V4F32 = fmul <4 x float> undef, undef @@ -270,8 +342,19 @@ define void @fmul_f16() { define void @fdiv() { ; CHECK-LABEL: 'fdiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fdiv <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fdiv <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fdiv <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fdiv <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fdiv <16 x bfloat> undef, 
undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fdiv <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv <4 x float> undef, undef @@ -292,9 +375,22 @@ define void @fdiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = fdiv bfloat undef, undef %F32 = fdiv float undef, undef %F64 = fdiv double undef, undef + %V1BF16 = fdiv <1 x bfloat> undef, undef + %V2BF16 = fdiv <2 x bfloat> undef, undef + %V4BF16 = fdiv <4 x bfloat> undef, undef + %V8BF16 = fdiv <8 x bfloat> undef, undef + %V16BF16 = fdiv <16 x bfloat> undef, undef + + %NXV1BF16 = fdiv undef, undef + %NXV2BF16 = fdiv undef, undef + %NXV4BF16 = fdiv undef, undef + %NXV8BF16 = fdiv undef, undef + %NXV16BF16 = fdiv undef, undef + %V1F32 = fdiv <1 x float> undef, undef %V2F32 = fdiv <2 x float> undef, undef %V4F32 = fdiv <4 x float> undef, undef @@ -358,8 +454,19 @@ define void @fdiv_f16() { define void @frem() { ; CHECK-LABEL: 'frem' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = frem <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2BF16 = frem <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16BF16 = frem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef @@ -380,9 +487,22 @@ define void @frem() { ; 
CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F64 = frem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = frem bfloat undef, undef %F32 = frem float undef, undef %F64 = frem double undef, undef + %V1BF16 = frem <1 x bfloat> undef, undef + %V2BF16 = frem <2 x bfloat> undef, undef + %V4BF16 = frem <4 x bfloat> undef, undef + %V8BF16 = frem <8 x bfloat> undef, undef + %V16BF16 = frem <16 x bfloat> undef, undef + + %NXV1BF16 = frem undef, undef + %NXV2BF16 = frem undef, undef + %NXV4BF16 = frem undef, undef + %NXV8BF16 = frem undef, undef + %NXV16BF16 = frem undef, undef + %V1F32 = frem <1 x float> undef, undef %V2F32 = frem <2 x float> undef, undef %V4F32 = frem <4 x float> undef, undef @@ -462,8 +582,19 @@ define void @frem_f16() { define void @fneg() { ; CHECK-LABEL: 'fneg' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fneg half undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fneg <1 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fneg <2 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fneg <4 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fneg <8 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fneg <16 x bfloat> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fneg undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fneg <1 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> undef @@ -484,9 +615,22 @@ define void @fneg() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fneg undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = fneg half undef %F32 = fneg float undef %F64 = fneg double undef + %V1BF16 = fneg <1 x bfloat> undef + %V2BF16 = fneg <2 x bfloat> undef + %V4BF16 = fneg <4 x bfloat> undef + %V8BF16 = fneg <8 x bfloat> undef + %V16BF16 = fneg <16 x bfloat> undef + + %NXV1BF16 = fneg undef + %NXV2BF16 = fneg undef + %NXV4BF16 = fneg undef + %NXV8BF16 = fneg undef + %NXV16BF16 = fneg undef + %V1F32 = fneg <1 x float> undef %V2F32 = fneg <2 x float> undef %V4F32 = fneg <4 x float> undef @@ -550,8 +694,19 @@ define void @fneg_f16() { define void @fcopysign() { ; CHECK-LABEL: 'fcopysign' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef) 
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = call @llvm.copysign.nxv1bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = call @llvm.copysign.nxv2bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = call @llvm.copysign.nxv4bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = call @llvm.copysign.nxv8bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = call @llvm.copysign.nxv16bf16( undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) @@ -572,9 +727,22 @@ define void @fcopysign() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F64 = call @llvm.copysign.nxv8f64( undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef) %F32 = call float @llvm.copysign.f32(float undef, float undef) %F64 = call double @llvm.copysign.f64(double undef, double undef) + %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef) + %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + + %NXV1BF16 = call @llvm.copysign.nxv1bf16( undef, undef) + %NXV2BF16 = call @llvm.copysign.nxv2bf16( undef, undef) + %NXV4BF16 = call @llvm.copysign.nxv4bf16( undef, undef) + %NXV8BF16 = call @llvm.copysign.nxv8bf16( undef, undef) + %NXV16BF16 = call @llvm.copysign.nxv16bf16( undef, undef) + %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef) %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef) %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) @@ -654,8 +822,19 @@ define void @fcopysign_f16() { define void @fma() { ; 
CHECK-LABEL: 'fma' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call @llvm.fma.nxv1bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call @llvm.fma.nxv2bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call @llvm.fma.nxv4bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8BF16 = call @llvm.fma.nxv8bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16BF16 = call @llvm.fma.nxv16bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) @@ -676,9 +855,22 @@ define void @fma() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F64 = call @llvm.fma.nxv8f64( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) + %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef) + %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) + %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) + %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) + %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) + + 
%NXV1BF16 = call @llvm.fma.nxv1bf16( undef, undef, undef) + %NXV2BF16 = call @llvm.fma.nxv2bf16( undef, undef, undef) + %NXV4BF16 = call @llvm.fma.nxv4bf16( undef, undef, undef) + %NXV8BF16 = call @llvm.fma.nxv8bf16( undef, undef, undef) + %NXV16BF16 = call @llvm.fma.nxv16bf16( undef, undef, undef) + %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef) %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) @@ -758,30 +950,45 @@ define void @fma_f16() { define void @fmuladd() { ; CHECK-LABEL: 'fmuladd' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call @llvm.fmuladd.nxv1f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call @llvm.fmuladd.nxv2f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call @llvm.fmuladd.nxv4f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call @llvm.fmuladd.nxv8f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call @llvm.fmuladd.nxv16f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.fmuladd.nxv1f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call @llvm.fmuladd.nxv2f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call @llvm.fmuladd.nxv4f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call @llvm.fmuladd.nxv8f64( undef, undef, undef) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = call @llvm.fmuladd.nxv16f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %7 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.fmuladd.nxv1bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call @llvm.fmuladd.nxv2bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call @llvm.fmuladd.nxv4bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call @llvm.fmuladd.nxv8bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call @llvm.fmuladd.nxv16bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call @llvm.fmuladd.nxv1f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call 
@llvm.fmuladd.nxv2f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call @llvm.fmuladd.nxv4f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call @llvm.fmuladd.nxv8f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call @llvm.fmuladd.nxv16f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call @llvm.fmuladd.nxv1f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call @llvm.fmuladd.nxv2f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call @llvm.fmuladd.nxv4f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call @llvm.fmuladd.nxv8f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call @llvm.fmuladd.nxv16f64( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef) call float @llvm.fmuladd.f32(float undef, float undef, float undef) call double @llvm.fmuladd.f64(double undef, double undef, double undef) + call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) + call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) + call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) + call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) @@ -790,6 +997,11 @@ define void @fmuladd() { call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef) + call @llvm.fmuladd.nxv1bf16( undef, undef, undef) + call @llvm.fmuladd.nxv2bf16( undef, undef, undef) + call @llvm.fmuladd.nxv4bf16( undef, undef, undef) + call @llvm.fmuladd.nxv8bf16( undef, undef, undef) + call @llvm.fmuladd.nxv16bf16( undef, undef, undef) call @llvm.fmuladd.nxv1f32( undef, undef, undef) call @llvm.fmuladd.nxv2f32( undef, undef, undef) call @llvm.fmuladd.nxv4f32( undef, undef, undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll index 807657797288da..bb98508f239c1b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll @@ -1029,10 +1029,10 @@ define void @store() { define void @strided_load() { ; CHECK-LABEL: 'strided_load' -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 
undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) @@ -1056,10 +1056,10 @@ define void @strided_load() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'strided_load' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: 
Found an estimated cost of 11 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 1c11e1221fef34..7e4a4d527fc903 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -26,6 +26,11 @@ declare i16 @llvm.nvvm.min.us(i16, i16) declare i32 @llvm.nvvm.min.ui(i32, i32) declare i64 @llvm.nvvm.min.ull(i64, i64) +declare i32 @llvm.nvvm.bitcast.f2i(float) +declare float @llvm.nvvm.bitcast.i2f(i32) +declare i64 @llvm.nvvm.bitcast.d2ll(double) +declare double @llvm.nvvm.bitcast.ll2d(i64) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -120,3 +125,18 @@ define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) { ret void } + +; CHECK-LABEL: @bitcast +define void @bitcast(i32 %a, i64 %b, float %c, double %d) { +; CHECK: bitcast float %c to i32 +; CHECK: bitcast i32 %a to float +; CHECK: bitcast double %d to i64 +; CHECK: bitcast i64 %b to double +; + %r1 = call i32 @llvm.nvvm.bitcast.f2i(float %c) + %r2 = call float @llvm.nvvm.bitcast.i2f(i32 %a) + %r3 = call i64 @llvm.nvvm.bitcast.d2ll(double %d) + %r4 = call double @llvm.nvvm.bitcast.ll2d(i64 %b) + + ret void +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir index 3e768c4d7a267c..03c28efe7e09fb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir @@ -159,25 +159,13 @@ body: | ; CHECK-LABEL: name: test_freeze_v3s8 ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16) - ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s16>) = G_FREEZE [[UV4]] - ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[FREEZE]](<4 x s16>) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), 
[[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>) ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CHECK-NEXT: %ext0:_(s32) = G_AND [[ANYEXT1]], [[C]] - ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; CHECK-NEXT: %ext1:_(s32) = G_AND [[ANYEXT2]], [[C]] - ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; CHECK-NEXT: %ext2:_(s32) = G_AND [[ANYEXT3]], [[C]] + ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8) + ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8) + ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8) ; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32) ; CHECK-NEXT: $q0 = COPY %res(<4 x s32>) %x:_(<3 x s8>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir index 9a8697c1d9b866..11c6c7fb40faa1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir @@ -248,13 +248,10 @@ body: | ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16) ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16) - ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16) - ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV8]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>) ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(<4 x s32>) = 
G_UITOFP [[BITCAST]](<4 x s32>) diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index e90014be21deb3..b14f1a43b7dcfd 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -177,9 +177,7 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){ ; ; CHECK-GI-LABEL: bswap_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: rev16 v0.8b, v0.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 18570b2d793ff6..eee917e8acb0d7 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -183,15 +183,12 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v8s16_v2s16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: ldr h2, [x0, #2] -; CHECK-GI-NEXT: dup v0.4s, w8 -; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: xtn v2.4h, v0.4s -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v0.s[0], w8 -; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: mov v0.s[3], w8 @@ -209,10 +206,7 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v16s8_v4s8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.8h, w8 -; CHECK-GI-NEXT: xtn v1.8b, v0.8h ; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: mov v0.s[3], w8 diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index aa20304e52a951..a9618fdc2dec30 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -3,24 +3,10 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { -; CHECK-SD-LABEL: interleave2_v4f16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: interleave2_v4f16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v2.4s, w8 -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: xtn v0.4h, v2.4s -; CHECK-GI-NEXT: mov v1.s[0], w8 -; CHECK-GI-NEXT: mov v2.s[0], w9 -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: interleave2_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) ret <4 x half> %retval } diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index 20b5567e973d09..f72a49f6ab7c89 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -3172,42 +3172,22 @@ entry: } define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) { -; CHECK-SD-LABEL: fptos_v3f32_v3i16: 
-; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptos_v3f32_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[2], w9 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptos_v3f32_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptosi <3 x float> %a to <3 x i16> ret <3 x i16> %c } define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) { -; CHECK-SD-LABEL: fptou_v3f32_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptou_v3f32_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[2], w9 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptou_v3f32_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptoui <3 x float> %a to <3 x i16> ret <3 x i16> %c @@ -6077,11 +6057,7 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 -; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16: @@ -6110,11 +6086,7 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 -; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16: diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 4ac04798e15481..f70ec0f35cb586 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -7450,9 +7450,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -7493,9 +7491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; 
CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -8059,8 +8055,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 7014a4a9acbe03..54f7887aee8d3e 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -531,26 +531,8 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){ ; ; CHECK-GI-LABEL: shl_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v1.h[1] -; CHECK-GI-NEXT: mov h4, v0.h[2] -; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov h3, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: mov v1.b[1], w9 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: fmov w8, s5 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov v0.b[3], w8 -; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] @@ -592,12 +574,8 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){ ; ; CHECK-GI-LABEL: shl_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] @@ -741,26 +719,8 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; ; CHECK-GI-LABEL: ashr_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h3, v0.h[1] -; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[3] -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov h3, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v1.b[3], w8 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] @@ -802,12 +762,8 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; ; CHECK-GI-LABEL: ashr_v2i16: ; CHECK-GI: // %bb.0: -; 
CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v1.s[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] @@ -946,26 +902,8 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; ; CHECK-GI-LABEL: lshr_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h3, v0.h[1] -; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[3] -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov h3, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v1.b[3], w8 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] @@ -1006,12 +944,8 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; ; CHECK-GI-LABEL: lshr_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v1.s[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index 954458e4459749..5f4ff1e64673bb 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -209,27 +209,9 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ ; ; CHECK-GI-LABEL: shufflevector_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v1.h[1] -; CHECK-GI-NEXT: mov h4, v0.h[2] -; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov h3, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: mov v1.b[1], w9 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: fmov w8, s5 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov v0.b[3], w8 -; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -284,13 +266,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; ; CHECK-GI-LABEL: shufflevector_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 
def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -403,16 +381,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){ ; ; CHECK-GI-LABEL: shufflevector_v4i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov h1, v0.h[3] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: dup v0.8b, v0.b[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret @@ -448,9 +417,7 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; ; CHECK-GI-LABEL: shufflevector_v2i16_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: dup v0.4h, v0.h[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index ead790203f9496..fb3f8ebd7d1413 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -294,19 +294,10 @@ entry: } define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) { -; CHECK-SD-LABEL: xtn_v3i32_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: xtn_v3i32_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[2], w9 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: xtn_v3i32_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %arg1 = trunc <3 x i32> %a to <3 x i16> ret <3 x i16> %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index b061d53de5d3c5..39a3b1c8adc9f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -2,11 +2,118 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 -declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0 -declare double @llvm.amdgcn.readfirstlane.f64(double) #0 +define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %src) + store i1 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1_inreg: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: s_and_b32 s4, s6, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1_inreg: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: s_and_b32 s4, s6, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %src) + store i1 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1_select: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2 +; CHECK-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-SDAG-NEXT: s_bitcmp1_b32 s4, 0 +; CHECK-SDAG-NEXT: s_cselect_b64 vcc, -1, 0 +; CHECK-SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1_select: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2 +; CHECK-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-GISEL-NEXT: s_and_b32 s4, 1, s4 +; CHECK-GISEL-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; CHECK-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp ugt i32 %src, 42 + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %cmp) + %sel = select i1 %readfirstlane, i32 %src, i32 %src1 + store i32 %sel, ptr addrspace(1) %out, align 4 + ret void +} -define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 { +define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1_load: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: flat_load_ubyte v2, v[2:3] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
CHECK-GISEL-LABEL: test_readfirstlane_i1_load: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: flat_load_ubyte v2, v[2:3] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %load = load i1, ptr addrspace(1) %in + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %load) + store i1 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -29,7 +136,7 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 { ret void } -define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 { +define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -56,7 +163,7 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 { ret void } -define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 { +define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -83,7 +190,7 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 { ret void } -define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_mov_b32 s0, 32 @@ -104,7 +211,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 ret void } -define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32 @@ -125,7 +232,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 ret void } -define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_mov_b32 s0, 0 @@ -148,7 +255,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 ret void } -define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -173,7 +280,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: ; 
CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -201,7 +308,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -230,7 +337,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -262,7 +369,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ret void } -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -294,7 +401,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ret void } -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -328,7 +435,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ret void } -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -362,7 +469,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ret void } -define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_fi: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15 @@ -593,6 +700,3 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { call void asm sideeffect "; use $0", "s"(<8 x i16> %x) ret void } - -attributes #0 = { nounwind readnone convergent } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/Generic/allow-check.ll b/llvm/test/CodeGen/Generic/allow-check.ll index a08488959862ab..148ee811ea806c 100644 --- a/llvm/test/CodeGen/Generic/allow-check.ll +++ b/llvm/test/CodeGen/Generic/allow-check.ll @@ -1,5 +1,5 @@ ; Avoid `!DL->isLittleEndian() && !CLI->enableBigEndian()` missmatch on PPC64BE. -; REQUIRES: host-byteorder-little-endian +; REQUIRES: target-byteorder-little-endian ; -global-isel=1 is unsupported. 
; XFAIL: target=loongarch{{.*}} diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir index 58e2e644b000fe..a40b4d85773b29 100644 --- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir +++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir @@ -40,9 +40,9 @@ registers: - { id: 7, class: float32regs } body: | bb.0.entry: - %0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0 + %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test_param_0 %1 = CVT_f64_f32 %0, 0 - %2 = LD_i32_avar 0, 4, 1, 0, 32, &test_param_1 + %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test_param_1 ; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00 %3 = FADD_rnf64ri %1, double 3.250000e+00 %4 = CVT_f32_f64 %3, 5 @@ -66,9 +66,9 @@ registers: - { id: 7, class: float32regs } body: | bb.0.entry: - %0 = LD_f32_avar 0, 4, 1, 2, 32, &test2_param_0 + %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test2_param_0 %1 = CVT_f64_f32 %0, 0 - %2 = LD_i32_avar 0, 4, 1, 0, 32, &test2_param_1 + %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test2_param_1 ; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000 %3 = FADD_rnf64ri %1, double 0x7FF8000000000000 %4 = CVT_f32_f64 %3, 5 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll index af3d4f50f3fe4f..8d548861f43936 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) { +; MIPS2-LABEL: sdiv_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: move $2, $4 +; ; GP32-LABEL: sdiv_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: move $2, $4 ; +; MIPS3-LABEL: sdiv_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: move $2, $4 +; ; GP64-LABEL: sdiv_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,15 @@ entry: } define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) { +; MIPS2-LABEL: sdiv_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: 
sll $1, $1, 24 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 24 +; ; GP32R0R2-LABEL: sdiv_i8: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: div $zero, $4, $5 @@ -94,6 +113,15 @@ define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seb $2, $1 ; +; MIPS3-LABEL: sdiv_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: sll $1, $1, 24 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 24 +; ; GP64R0R1-LABEL: sdiv_i8: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: div $zero, $4, $5 @@ -138,6 +166,15 @@ entry: } define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) { +; MIPS2-LABEL: sdiv_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: sll $1, $1, 16 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 16 +; ; GP32R0R2-LABEL: sdiv_i16: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: div $zero, $4, $5 @@ -162,6 +199,15 @@ define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seh $2, $1 ; +; MIPS3-LABEL: sdiv_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: sll $1, $1, 16 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 16 +; ; GP64R0R1-LABEL: sdiv_i16: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: div $zero, $4, $5 @@ -206,6 +252,14 @@ entry: } define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: sdiv_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: sdiv_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -219,6 +273,14 @@ define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: sdiv_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: sdiv_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -250,6 +312,22 @@ entry: } define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: sdiv_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__divdi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: sdiv_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -281,6 +359,14 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: sdiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: sdiv_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddiv $zero, $4, $5 @@ -332,6 +418,30 @@ entry: } define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: sdiv_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; 
MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__divti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: sdiv_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -379,6 +489,25 @@ define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: sdiv_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(sdiv_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(sdiv_i128))) +; MIPS3-NEXT: ld $25, %call16(__divti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: sdiv_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll index 6349d5c64ab429..29cb34b8d970f1 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define signext i1 @srem_i1(i1 signext %a, i1 signext %b) { +; MIPS2-LABEL: srem_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $2, $zero, 0 +; ; GP32-LABEL: srem_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define signext i1 @srem_i1(i1 signext %a, i1 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $2, $zero, 0 ; +; 
MIPS3-LABEL: srem_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: addiu $2, $zero, 0 +; ; GP64-LABEL: srem_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,14 @@ entry: } define signext i8 @srem_i8(i8 signext %a, i8 signext %b) { +; MIPS2-LABEL: srem_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: srem_i8: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -83,6 +101,14 @@ define signext i8 @srem_i8(i8 signext %a, i8 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: srem_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i8: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -114,6 +140,14 @@ entry: } define signext i16 @srem_i16(i16 signext %a, i16 signext %b) { +; MIPS2-LABEL: srem_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: srem_i16: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -127,6 +161,14 @@ define signext i16 @srem_i16(i16 signext %a, i16 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: srem_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i16: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -158,6 +200,14 @@ entry: } define signext i32 @srem_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: srem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: srem_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -171,6 +221,14 @@ define signext i32 @srem_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: srem_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -202,6 +260,22 @@ entry: } define signext i64 @srem_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: srem_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__moddi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: srem_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -233,6 +307,14 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: srem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i64: ; GP64: # %bb.0: # %entry ; 
GP64-NEXT: ddiv $zero, $4, $5 @@ -284,6 +366,30 @@ entry: } define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: srem_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__modti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: srem_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -331,6 +437,25 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: srem_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(srem_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(srem_i128))) +; MIPS3-NEXT: ld $25, %call16(__modti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: srem_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll new file mode 100644 index 00000000000000..db2c660e9bc79b --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +define signext i32 @mult_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: mult_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: mult $4, $5 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: mult $1, $6 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: mult_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: mul $1, $4, $5 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mul $2, $1, $6 +; +entry: + %mul = mul nsw i32 %a, %b + %mul1 = mul nsw i32 %mul, %c + ret i32 %mul1 +} + +define signext i64 @mul_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: mul_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: dmult $4, $5 +; 
MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: dmult $1, $6 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: mul_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: dmult $4, $5 +; MIPS64-NEXT: mflo $1 +; MIPS64-NEXT: dmult $1, $6 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mflo $2 +; +entry: + %mul = mul i64 %a, %b + %mul1 = mul i64 %mul, %c + ret i64 %mul1 +} diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll new file mode 100644 index 00000000000000..4ec5ecc9e2f17d --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: sdiv_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: div $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: sdiv_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: div $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mflo $1 +; MIPS32-NEXT: div $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mflo $2 +; +entry: + %sdiv = sdiv i32 %a, %b + %sdiv1 = sdiv i32 %sdiv, %c + ret i32 %sdiv1 +} + +define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: sdiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddiv $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: sdiv_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddiv $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mflo $1 +; MIPS64-NEXT: ddiv $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mflo $2 +; +entry: + %sdiv = sdiv i64 %a, %b + %sdiv1 = sdiv i64 %sdiv, %c + ret i64 %sdiv1 +} + +define signext i32 @sdiv_lw_sdiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: sdiv_lw_sdiv_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: 
teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: sdiv_lw_sdiv_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %sdiv = sdiv i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %sdiv1 = sdiv i32 %sdiv, %2 + ret i32 %sdiv1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll new file mode 100644 index 00000000000000..4f729b015b2822 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @srem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: srem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: div $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: srem_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: div $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mfhi $1 +; MIPS32-NEXT: div $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mfhi $2 +; +entry: + %rem = srem i32 %a, %b + %rem1 = srem i32 %rem, %c + ret i32 %rem1 +} + +define signext i64 @srem_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: srem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddiv $zero, $1, 
$6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: srem_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddiv $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mfhi $1 +; MIPS64-NEXT: ddiv $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mfhi $2 +; +entry: + %rem = srem i64 %a, %b + %rem1 = srem i64 %rem, %c + ret i64 %rem1 +} + +define signext i32 @srem_lw_srem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: srem_lw_srem_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: srem_lw_srem_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %rem = srem i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %rem1 = srem i32 %rem, %2 + ret i32 %rem1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll new file mode 100644 index 00000000000000..97ac0d8031cf55 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @udiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: udiv_i32: +; MIPS2: # %bb.0: # %entry 
+; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: divu $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: udiv_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: divu $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mflo $1 +; MIPS32-NEXT: divu $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mflo $2 +; +entry: + %udiv = udiv i32 %a, %b + %udiv1 = udiv i32 %udiv, %c + ret i32 %udiv1 +} + +define signext i64 @udiv_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: udiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddivu $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: udiv_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddivu $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mflo $1 +; MIPS64-NEXT: ddivu $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mflo $2 +; +entry: + %udiv = udiv i64 %a, %b + %udiv1 = udiv i64 %udiv, %c + ret i64 %udiv1 +} + +define signext i32 @udiv_lw_udiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: udiv_lw_udiv_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: udiv_lw_udiv_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %udiv = udiv i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %udiv1 = udiv i32 %udiv, %2 + ret i32 %udiv1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll new file mode 100644 index 00000000000000..e1819f1d57b7db --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 
-relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @urem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: urem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: divu $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: urem_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: divu $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mfhi $1 +; MIPS32-NEXT: divu $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mfhi $2 +; +entry: + %urem = urem i32 %a, %b + %urem1 = urem i32 %urem, %c + ret i32 %urem1 +} + +define signext i64 @urem_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: urem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddivu $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: urem_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddivu $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mfhi $1 +; MIPS64-NEXT: ddivu $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mfhi $2 +; +entry: + %urem = urem i64 %a, %b + %urem1 = urem i64 %urem, %c + ret i64 %urem1 +} + +define signext i32 @urem_lw_urem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: urem_lw_urem_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: urem_lw_urem_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 
+; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %rem = urem i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %urem1 = urem i32 %rem, %2 + ret i32 %urem1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll index e3dd347e723bc8..cc2c6614e69c8f 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) { +; MIPS2-LABEL: udiv_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: move $2, $4 +; ; GP32-LABEL: udiv_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: move $2, $4 ; +; MIPS3-LABEL: udiv_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: move $2, $4 +; ; GP64-LABEL: udiv_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,14 @@ entry: } define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) { +; MIPS2-LABEL: udiv_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: udiv_i8: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -83,6 +101,14 @@ define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: udiv_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i8: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -114,6 +140,14 @@ entry: } define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) { +; MIPS2-LABEL: udiv_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; 
MIPS2-NEXT: nop +; ; GP32-LABEL: udiv_i16: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -127,6 +161,14 @@ define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: udiv_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i16: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -158,6 +200,14 @@ entry: } define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: udiv_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: udiv_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -171,6 +221,14 @@ define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: udiv_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -202,6 +260,22 @@ entry: } define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: udiv_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__udivdi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: udiv_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -233,6 +307,14 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: udiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddivu $zero, $4, $5 @@ -284,6 +366,30 @@ entry: } define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: udiv_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__udivti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: udiv_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -331,6 +437,25 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: udiv_i128: +; MIPS3: # %bb.0: # 
%entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(udiv_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(udiv_i128))) +; MIPS3-NEXT: ld $25, %call16(__udivti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: udiv_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll index 4105d67da6f1ac..5da1f614b8f157 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define signext i1 @urem_i1(i1 signext %a, i1 signext %b) { +; MIPS2-LABEL: urem_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $2, $zero, 0 +; ; GP32-LABEL: urem_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define signext i1 @urem_i1(i1 signext %a, i1 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $2, $zero, 0 ; +; MIPS3-LABEL: urem_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: addiu $2, $zero, 0 +; ; GP64-LABEL: urem_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,17 @@ entry: } define signext i8 @urem_i8(i8 signext %a, i8 signext %b) { +; MIPS2-LABEL: urem_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: andi $1, $5, 255 +; MIPS2-NEXT: andi $2, $4, 255 +; MIPS2-NEXT: divu $zero, $2, $1 +; MIPS2-NEXT: teq $1, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: sll $1, $1, 24 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 24 +; ; GP32R0R2-LABEL: urem_i8: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: andi $1, $5, 255 @@ -100,6 +121,17 @@ define signext i8 @urem_i8(i8 signext %a, i8 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seb $2, $2 ; +; MIPS3-LABEL: urem_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: andi $1, $5, 255 +; MIPS3-NEXT: andi $2, $4, 255 +; MIPS3-NEXT: 
divu $zero, $2, $1 +; MIPS3-NEXT: teq $1, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: sll $1, $1, 24 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 24 +; ; GP64R0R1-LABEL: urem_i8: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: andi $1, $5, 255 @@ -154,6 +186,17 @@ entry: } define signext i16 @urem_i16(i16 signext %a, i16 signext %b) { +; MIPS2-LABEL: urem_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: andi $1, $5, 65535 +; MIPS2-NEXT: andi $2, $4, 65535 +; MIPS2-NEXT: divu $zero, $2, $1 +; MIPS2-NEXT: teq $1, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: sll $1, $1, 16 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 16 +; ; GP32R0R2-LABEL: urem_i16: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: andi $1, $5, 65535 @@ -184,6 +227,17 @@ define signext i16 @urem_i16(i16 signext %a, i16 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seh $2, $2 ; +; MIPS3-LABEL: urem_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: andi $1, $5, 65535 +; MIPS3-NEXT: andi $2, $4, 65535 +; MIPS3-NEXT: divu $zero, $2, $1 +; MIPS3-NEXT: teq $1, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: sll $1, $1, 16 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 16 +; ; GP64R0R1-LABEL: urem_i16: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: andi $1, $5, 65535 @@ -238,6 +292,14 @@ entry: } define signext i32 @urem_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: urem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: urem_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -251,6 +313,14 @@ define signext i32 @urem_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: urem_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: urem_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -282,6 +352,22 @@ entry: } define signext i64 @urem_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: urem_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__umoddi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: urem_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -313,6 +399,14 @@ define signext i64 @urem_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: urem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: urem_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddivu $zero, $4, $5 @@ -364,6 +458,30 @@ entry: } define signext i128 @urem_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: urem_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; 
MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__umodti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: urem_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -411,6 +529,25 @@ define signext i128 @urem_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: urem_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(urem_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(urem_i128))) +; MIPS3-NEXT: ld $25, %call16(__umodti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: urem_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll new file mode 100644 index 00000000000000..82eb5fb71677b6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: fence_sc_cluster +define void @fence_sc_cluster() local_unnamed_addr { + ; CHECK: fence.sc.cluster + fence syncscope("cluster") seq_cst + ret void +} + +; CHECK-LABEL: fence_acq_rel_cluster +define void @fence_acq_rel_cluster() local_unnamed_addr { + ; CHECK: fence.acq_rel.cluster + fence syncscope("cluster") acq_rel + ret void +} + +; CHECK-LABEL: fence_release_cluster +define void @fence_release_cluster() local_unnamed_addr { + ; CHECK: fence.acq_rel.cluster + fence syncscope("cluster") release + ret void +} + +; CHECK-LABEL: fence_acquire_cluster +define void @fence_acquire_cluster() local_unnamed_addr { + ; CHECK: fence.acq_rel.cluster + fence syncscope("cluster") acquire + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll index d3aace95e96650..626685f82f32ca 100644 --- a/llvm/test/CodeGen/NVPTX/fence.ll +++ b/llvm/test/CodeGen/NVPTX/fence.ll @@ -3,6 +3,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} +; TODO: implement and test thread scope. 
+ ; CHECK-LABEL: fence_sc_sys define void @fence_sc_sys() local_unnamed_addr { ; SM60: membar.sys @@ -16,21 +18,85 @@ define void @fence_acq_rel_sys() local_unnamed_addr { ; SM60: membar.sys ; SM70: fence.acq_rel.sys fence acq_rel - ret void + ret void } ; CHECK-LABEL: fence_release_sys define void @fence_release_sys() local_unnamed_addr { ; SM60: membar.sys - ; SM70: fence.acq_rel.sys + ; SM70: fence.acq_rel.sys fence release - ret void + ret void } ; CHECK-LABEL: fence_acquire_sys define void @fence_acquire_sys() local_unnamed_addr { ; SM60: membar.sys - ; SM70: fence.acq_rel.sys + ; SM70: fence.acq_rel.sys fence acquire - ret void + ret void +} + +; CHECK-LABEL: fence_sc_gpu +define void @fence_sc_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.sc.gpu + fence syncscope("device") seq_cst + ret void +} + +; CHECK-LABEL: fence_acq_rel_gpu +define void @fence_acq_rel_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.acq_rel.gpu + fence syncscope("device") acq_rel + ret void +} + +; CHECK-LABEL: fence_release_gpu +define void @fence_release_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.acq_rel.gpu + fence syncscope("device") release + ret void +} + +; CHECK-LABEL: fence_acquire_gpu +define void @fence_acquire_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.acq_rel.gpu + fence syncscope("device") acquire + ret void +} + +; CHECK-LABEL: fence_sc_cta +define void @fence_sc_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.sc.cta + fence syncscope("block") seq_cst + ret void +} + +; CHECK-LABEL: fence_acq_rel_cta +define void @fence_acq_rel_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.acq_rel.cta + fence syncscope("block") acq_rel + ret void +} + +; CHECK-LABEL: fence_release_cta +define void @fence_release_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.acq_rel.cta + fence syncscope("block") release + ret void +} + +; CHECK-LABEL: fence_acquire_cta +define void @fence_acquire_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.acq_rel.cta + fence syncscope("block") acquire + ret void } \ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 9cea33d12027f2..4b200eacb0cf4a 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,10 +1,367 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" +; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile release": generates "st.release.sys;" +; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;" + +; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;" +; but should generate "fence.sc.sys; ld.relaxed.mmio.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;" +; but should generate "fence.sc.sys; st.relaxed.mmio.sys;" + +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. + +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
+ +; TODO: generate PTX that preserves Concurrent Forward Progress +; for atomic operations to local statespace +; by generating atomic or volatile operations. + +; TODO: design exposure for atomic operations on vector types. + +; TODO: implement and test thread scope. + +; TODO: add weak,atomic,volatile,atomic volatile tests +; for .const and .param statespaces. + +; TODO: optimize .sys.shared into .cta.shared or .cluster.shared . + ;; generic statespace -; CHECK-LABEL: generic_acq_rel -define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_unordered_gpu +define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1 + + ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2 + + ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8 + + ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_volatile_gpu +define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_cta +define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1 + + ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2 + + ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8 + + ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_volatile_cta +define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_gpu +define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile_gpu +define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_cta +define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile_cta +define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_sys +define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -31,7 +388,7 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e release, align 4 @@ -44,8 +401,8 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ret void } -; CHECK-LABEL: generic_acq_rel_volatile -define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_acq_rel_volatile_sys +define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -72,7 +429,7 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e release, align 4 @@ -85,8 +442,172 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo ret void } -; CHECK-LABEL: generic_sc -define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_acq_rel_gpu +define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_volatile_gpu +define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_cta +define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_volatile_cta +define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_sys +define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a seq_cst, align 1 @@ -122,7 +643,7 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e seq_cst, align 4 @@ -138,8 +659,8 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad ret void } -; CHECK-LABEL: generic_sc_volatile -define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_sc_volatile_sys +define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a seq_cst, align 1 @@ -175,7 +696,7 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e seq_cst, align 4 @@ -191,393 +712,2338 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ret void } -;; global statespace - -; CHECK-LABEL: global_acq_rel -define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1 +; CHECK-LABEL: generic_sc_gpu +define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a release, align 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2 + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b release, align 2 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4 + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c release, align 4 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4 - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8 + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d release, align 8 - - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e release, align 4 - - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e release, align 8 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8 ret void } -; CHECK-LABEL: global_acq_rel_volatile -define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1 +; CHECK-LABEL: generic_sc_volatile_gpu +define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, 
[%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4 - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8 - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4 - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_cta +define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8 ret void } -; CHECK-LABEL: global_seq_cst -define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: generic_sc_volatile_cta +define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2 + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4 + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8 + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8 + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8 ret void } -; CHECK-LABEL: global_seq_cst_volatile -define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 +;; global statespace + +; CHECK-LABEL: global_unordered_gpu +define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 + ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2 + ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 + ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4 + ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 + ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8 + ; CHECK: 
ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 + ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 + ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8 + ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 + ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8 ret void } -;; shared statespace +; CHECK-LABEL: global_unordered_volatile_gpu +define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1 -; CHECK-LABEL: shared_acq_rel -define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1 + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, 
align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_unordered_cta +define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a release, align 1 + ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2 + ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b release, align 2 + ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4 + ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c release, align 4 + ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4 - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8 + ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d release, align 8 + ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 
%d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e release, align 4 + ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8 + ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e release, align 8 + ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8 ret void } -; CHECK-LABEL: shared_acq_rel_volatile -define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1 +; CHECK-LABEL: global_unordered_volatile_cta +define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2 + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4 + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4 + ; CHECK: 
st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4 - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8 + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4 + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8 + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8 ret void } -; CHECK-LABEL: shared_seq_cst -define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 +; CHECK-LABEL: global_monotonic_gpu +define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 + ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2 + ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 + ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4 + ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 + ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8 + ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 + ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: fence.sc.sys - ; CHECK: 
st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 + ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 + ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8 ret void } -; CHECK-LABEL: shared_seq_cst_volatile -define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 +; CHECK-LABEL: global_monotonic_volatile_gpu +define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_cta +define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile_cta +define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_sys +define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_sys +define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_gpu +define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_gpu +define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_cta +define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_cta +define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_sys +define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_sys +define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_gpu +define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_gpu +define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_cta +define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_cta +define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + + ret void +} + +;; shared statespace + +; CHECK-LABEL: shared_unordered_gpu +define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_volatile_gpu +define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_cta +define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_volatile_cta +define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_gpu +define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile_gpu +define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_cta +define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile_cta +define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_sys +define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_sys +define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_gpu +define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_gpu +define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_cta +define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_cta +define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_sys +define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_sys +define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_gpu +define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_gpu +define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_cta +define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_cta +define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_unordered_gpu +define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_volatile_gpu +define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_cta +define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_volatile_cta +define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_gpu +define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile_gpu +define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_cta +define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile_cta +define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2 + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4 + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8 + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4 - ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8 + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8 ret void } -;; local statespace - -; CHECK-LABEL: local_acq_rel -define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. - +; CHECK-LABEL: local_acq_rel_sys +define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -604,7 +3070,7 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e release, align 4 @@ -617,11 +3083,8 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ret void } -; CHECK-LABEL: local_acq_rel_volatile -define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. - +; CHECK-LABEL: local_acq_rel_volatile_sys +define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -648,7 +3111,7 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 @@ -661,11 +3124,172 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ret void } -; CHECK-LABEL: local_seq_cst -define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
+; CHECK-LABEL: local_acq_rel_gpu +define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile_gpu +define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_cta +define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile_cta +define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8 + + ret void +} +; CHECK-LABEL: local_seq_cst_sys +define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 @@ -692,7 +3316,7 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4 @@ -705,11 +3329,8 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ret void } -; CHECK-LABEL: local_seq_cst_volatile -define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
- +; CHECK-LABEL: local_seq_cst_volatile_sys +define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 @@ -736,7 +3357,7 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4 @@ -746,10 +3367,169 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void +} + +; CHECK-LABEL: local_seq_cst_gpu +define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_volatile_gpu +define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_cta +define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 ret void } -; TODO: add plain,atomic,volatile,atomic volatile tests -; for .const and .param statespaces \ No newline at end of file +; CHECK-LABEL: local_seq_cst_volatile_cta +define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll new file mode 100644 index 00000000000000..645170da51a011 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll @@ -0,0 +1,1423 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} + +; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" +; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile release": generates "st.release.sys;" +; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;" + +; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;" +; but should generate "fence.sc.sys; ld.relaxed.mmio.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;" +; but should generate "fence.sc.sys; st.relaxed.mmio.sys;" + +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. 
+ +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + +; TODO: generate PTX that preserves Concurrent Forward Progress +; for atomic operations to local statespace +; by generating atomic or volatile operations. + +; TODO: design exposure for atomic operations on vector types. + +; TODO: implement and test thread scope. + +; TODO: add weak,atomic,volatile,atomic volatile tests +; for .const and .param statespaces. + +; TODO: optimize .shared.sys into .shared.cta or .shared.cluster . + +;; generic statespace + +; CHECK-LABEL: generic_unordered_cluster +define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_volatile_cluster +define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_cluster +define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile_cluster +define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_cluster +define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_volatile_cluster +define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_cluster +define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_volatile_cluster +define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +;; global statespace + +; CHECK-LABEL: global_unordered_cluster +define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_unordered_volatile_cluster +define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_cluster +define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile_cluster +define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_cluster +define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_cluster +define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_cluster +define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_cluster +define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +;; shared + +; CHECK-LABEL: shared_unordered_cluster +define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_volatile_cluster +define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_cluster +define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile_cluster +define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_cluster +define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_cluster +define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_cluster +define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_cluster +define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_unordered_cluster +define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_volatile_cluster +define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_cluster +define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile_cluster +define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_cluster +define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile_cluster +define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_cluster +define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_volatile_cluster +define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index aac73f71a6766c..f922fd92fa244e 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -9,10 +9,21 @@ ; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. +; TODO: generate PTX that preserves Concurrent Forward Progress +; for atomic operations to local statespace +; by generating atomic or volatile operations. + +; TODO: design exposure for atomic operations on vector types. + +; TODO: add weak,atomic,volatile,atomic volatile tests +; for .const and .param statespaces. + +; TODO: optimize .sys.shared into .cta.shared or .cluster.shared . 
+ ; generic statespace -; CHECK-LABEL: generic_plain -define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { +; CHECK-LABEL: generic_weak +define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 @@ -238,198 +249,198 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ret void } -; CHECK-LABEL: generic_monotonic -define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_unordered_sys +define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a monotonic, align 1 + %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a monotonic, align 1 + store atomic i8 %a.add, ptr %a unordered, align 1 ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 + %b.load = load atomic i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 + store atomic i16 %b.add, ptr %b unordered, align 2 ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 + %c.load = load atomic i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 + store atomic i32 %c.add, ptr %c unordered, align 4 ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 + %d.load = load atomic i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 + store atomic i64 %d.add, ptr %d unordered, align 8 ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 + %e.load = load atomic float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 + store atomic float %e.add, ptr %e unordered, align 4 ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 + %f.load = load atomic double, ptr %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 + store atomic double %f.add, ptr %e unordered, align 8 ret void } -; CHECK-LABEL: generic_monotonic_volatile -define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_unordered_volatile_sys +define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr %a unordered, align 1 ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr %b unordered, align 2 ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr %c unordered, align 4 ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr %d unordered, align 8 ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.load = load atomic volatile float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 + store atomic volatile float %e.add, ptr %e unordered, align 4 ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 + %f.load = load atomic volatile double, ptr %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 + store atomic volatile double %f.add, ptr %e unordered, align 8 ret void } -; CHECK-LABEL: generic_unordered -define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_monotonic_sys +define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a unordered, align 1 + %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a unordered, align 1 + store atomic i8 %a.add, ptr %a monotonic, align 1 ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 + %b.load = load atomic i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 + store atomic i16 %b.add, ptr %b monotonic, align 2 ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 + %c.load = load atomic i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 + store atomic i32 %c.add, ptr %c monotonic, align 4 ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 + %d.load = load atomic i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 + store atomic i64 %d.add, ptr %d monotonic, align 8 ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 + store atomic float %e.add, ptr %e monotonic, align 4 ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 + %f.load = load atomic double, ptr %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 + store atomic double %f.add, ptr %e monotonic, align 8 ret void } -; CHECK-LABEL: generic_unordered_volatile -define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_monotonic_volatile_sys +define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a unordered, align 1 + %a.load = load atomic volatile i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a unordered, align 1 + store atomic volatile i8 %a.add, ptr %a monotonic, align 1 ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 + %b.load = load atomic volatile i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 + store atomic volatile i16 %b.add, ptr %b monotonic, align 2 ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 + %c.load = load atomic volatile i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 + store atomic volatile i32 %c.add, ptr %c monotonic, align 4 ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 + %d.load = load atomic volatile i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d unordered, align 8 + store atomic volatile i64 %d.add, ptr %d monotonic, align 8 ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 + store atomic volatile float %e.add, ptr %e monotonic, align 4 ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e unordered, align 8 + %f.load = load atomic volatile double, ptr %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 + store atomic volatile double %f.add, ptr %e monotonic, align 8 ret void } ;; global statespace -; CHECK-LABEL: global_plain -define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { +; CHECK-LABEL: global_weak +define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 @@ -630,222 +641,222 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ret void } -; CHECK-LABEL: global_monotonic -define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_unordered_sys +define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.load = 
load atomic float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 ret void } -; CHECK-LABEL: global_monotonic_volatile -define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_unordered_volatile_sys +define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 ret void } -; CHECK-LABEL: global_unordered -define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_monotonic_sys +define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 + %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 + store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 + %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 + store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 + %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 + store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 + %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 + store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 + store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 + %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 + store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void } -; CHECK-LABEL: global_unordered_volatile -define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_monotonic_volatile_sys +define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 + %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 + store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 + %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 + store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 + %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: 
st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 + store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 + %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 + store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 + store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 + %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 + store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void } ;; shared statespace -; CHECK-LABEL: shared_plain -define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { +; CHECK-LABEL: shared_weak +define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 @@ -1046,202 +1057,198 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ret void } -; CHECK-LABEL: shared_monotonic -define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. 
- +; CHECK-LABEL: shared_unordered_sys +define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
   ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+  store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
   ret void
 }
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_unordered_volatile_sys
+define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
   ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
   ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
   ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
   ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+  %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
   ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+  store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
   ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+  %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 ret void } -; CHECK-LABEL: shared_unordered -define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. - +; CHECK-LABEL: shared_monotonic_sys +define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 + %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 + store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 + %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 + store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 + %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 + store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 + %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 + store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. 
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 + store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 + %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 + store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 ret void } -; CHECK-LABEL: shared_unordered_volatile -define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_monotonic_volatile_sys +define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 + %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 + store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 + %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 + store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 + %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 + store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 + %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 + store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. 
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 + store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 + %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 + store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 ret void } ;; local statespace -; CHECK-LABEL: local_plain -define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { +; CHECK-LABEL: local_weak +define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -1343,9 +1350,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK-LABEL: local_volatile define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using volatile operations. - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -1445,175 +1449,166 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ret void } -; CHECK-LABEL: local_monotonic -define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
- +; CHECK-LABEL: local_unordered_sys +define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 ret void } -; CHECK-LABEL: local_monotonic_volatile -define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by generating atomic or volatile operations - +; CHECK-LABEL: local_unordered_volatile_sys +define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 ret void } -; CHECK-LABEL: local_unordered -define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_monotonic_sys +define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 + store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 + %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 + store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 ret void } -; CHECK-LABEL: local_unordered_volatile -define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_monotonic_volatile +define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 ret void } - -; TODO: add plain,atomic,volatile,atomic volatile tests -; for .const and .param statespaces \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll new file mode 100644 index 00000000000000..52048a0a2065bc --- /dev/null +++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll @@ -0,0 +1,415 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE +; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1 +; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX2 +; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512F +; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512BW + +define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind { +; SSE-LABEL: v_test_canonicalize__half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rbx) +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize__half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $16, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: addq $16, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: v_test_canonicalize__half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $16, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: addq $16, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v_test_canonicalize__half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzwl (%rdi), %eax +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vmulss 
%xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v_test_canonicalize__half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzwl (%rdi), %eax +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: movw %ax, (%rdi) +; AVX512BW-NEXT: retq +entry: + %val = load half, half addrspace(1)* %out + %canonicalized = call half @llvm.canonicalize.f16(half %val) + store half %canonicalized, half addrspace(1)* %out + ret void +} + +define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { +; SSE-LABEL: complex_canonicalize_fmul_half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: popq %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: complex_canonicalize_fmul_half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; 
AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: complex_canonicalize_fmul_half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX2-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: popq %rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: complex_canonicalize_fmul_half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpextrw $0, %xmm1, %eax +; AVX512F-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: complex_canonicalize_fmul_half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax +; AVX512BW-NEXT: vpextrw $0, %xmm0, %ecx 
+; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm2 +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +entry: + + %mul1 = fsub half %a, %b + %add = fadd half %mul1, %b + %mul2 = fsub half %add, %mul1 + %canonicalized = call half @llvm.canonicalize.f16(half %mul2) + %result = fsub half %canonicalized, %b + ret half %result +} + +define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind { +; SSE-LABEL: v_test_canonicalize_v2half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $48, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: pinsrw $0, 2(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, 2(%rbx) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rbx) +; SSE-NEXT: addq $48, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_v2half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $48, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte 
Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: addq $48, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: v_test_canonicalize_v2half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $48, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: addq $48, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v_test_canonicalize_v2half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-NEXT: vmovd %xmm0, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v_test_canonicalize_v2half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: 
vxorps %xmm3, %xmm3, %xmm3 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BW-NEXT: vmovd %xmm0, (%rdi) +; AVX512BW-NEXT: retq +entry: + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll new file mode 100644 index 00000000000000..13ea53389411bc --- /dev/null +++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll @@ -0,0 +1,636 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 +; RUN: llc -mtriple=i686-- --mattr=-sse2 < %s | FileCheck %s -check-prefixes=SSE1 +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE2 +; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1 +; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX2 +; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX512F + +define float @canon_fp32_varargsf32(float %a) { +; SSE1-LABEL: canon_fp32_varargsf32: +; SSE1: # %bb.0: +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp32_varargsf32: +; SSE2: # %bb.0: +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp32_varargsf32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq + + %canonicalized = call float @llvm.canonicalize.f32(float %a) + ret float %canonicalized +} + +define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) { +; SSE1-LABEL: canon_fp32_varargsf80: +; SSE1: # %bb.0: +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp32_varargsf80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp32_varargsf80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: retq + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %a) + ret x86_fp80 %canonicalized +} + +define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) { +; SSE1-LABEL: complex_canonicalize_fmul_x86_fp80: +; SSE1: # %bb.0: # %entry +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fsub %st(1), %st +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fadd %st(2), %st +; SSE1-NEXT: fsubp %st, %st(1) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: fsubp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: complex_canonicalize_fmul_x86_fp80: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: fldt 
{{[0-9]+}}(%rsp) +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fsub %st(1), %st +; SSE2-NEXT: fld %st(0) +; SSE2-NEXT: fadd %st(2), %st +; SSE2-NEXT: fsubp %st, %st(1) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: fsubp %st, %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: complex_canonicalize_fmul_x86_fp80: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fsub %st(1), %st +; AVX1-NEXT: fld %st(0) +; AVX1-NEXT: fadd %st(2), %st +; AVX1-NEXT: fsubp %st, %st(1) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: fsubp %st, %st(1) +; AVX1-NEXT: retq +entry: + + %mul1 = fsub x86_fp80 %a, %b + %add = fadd x86_fp80 %mul1, %b + %mul2 = fsub x86_fp80 %add, %mul1 + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %mul2) + %result = fsub x86_fp80 %canonicalized, %b + ret x86_fp80 %result +} + +define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 { +; SSE1-LABEL: canonicalize_fp64: +; SSE1: # %bb.0: # %start +; SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; SSE1-NEXT: fucom %st(1) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fucom %st(0) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: ja .LBB3_2 +; SSE1-NEXT: # %bb.1: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fldz +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: .LBB3_2: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: jp .LBB3_4 +; SSE1-NEXT: # %bb.3: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: fldz +; SSE1-NEXT: .LBB3_4: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_fp64: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canonicalize_fp64: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canonicalize_fp64: +; AVX512F: # %bb.0: # %start +; AVX512F-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512F-NEXT: retq +start: + + %c = fcmp olt double %a, %b + %d = fcmp uno double %a, 0.000000e+00 + %or.cond.i.i = or i1 %d, %c + %e = select i1 %or.cond.i.i, double %b, double %a + %f = tail call double @llvm.canonicalize.f64(double %e) #2 + ret double %f +} + +define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 { +; SSE1-LABEL: canonicalize_fp32: +; SSE1: # %bb.0: # %start +; SSE1-NEXT: flds {{[0-9]+}}(%esp) +; SSE1-NEXT: flds {{[0-9]+}}(%esp) +; SSE1-NEXT: fucom %st(1) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fucom %st(0) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: ja .LBB4_2 +; SSE1-NEXT: # %bb.1: # 
%start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fldz +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: .LBB4_2: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: jp .LBB4_4 +; SSE1-NEXT: # %bb.3: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: fldz +; SSE1-NEXT: .LBB4_4: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_fp32: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canonicalize_fp32: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canonicalize_fp32: +; AVX512F: # %bb.0: # %start +; AVX512F-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512F-NEXT: retq +start: + + %cc = fcmp olt float %aa, %bb + %dd = fcmp uno float %aa, 0.000000e+00 + %or.cond.i.i.x = or i1 %dd, %cc + %ee = select i1 %or.cond.i.i.x, float %bb, float %aa + %ff = tail call float @llvm.canonicalize.f32(float %ee) #2 + ret float %ff +} + +define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_var_f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmuls (%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_var_f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movss %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_var_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: retq + %val = load float, float addrspace(1)* %out + %canonicalized = call float @llvm.canonicalize.f32(float %val) + store float %canonicalized, float addrspace(1)* %out + ret void +} + +define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_x86_fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fldt (%eax) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: fstpt (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_x86_fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt (%rdi) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: fstpt (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_x86_fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt (%rdi) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: fstpt (%rdi) +; AVX1-NEXT: retq + + %val = load x86_fp80, x86_fp80 addrspace(1)* %out + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val) + store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out + ret void +} + +define void 
@v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_var_f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmull (%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_var_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movsd %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_var_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovsd %xmm0, (%rdi) +; AVX1-NEXT: retq + + %val = load double, double addrspace(1)* %out + %canonicalized = call double @llvm.canonicalize.f64(double %val) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +define void @canonicalize_undef(double addrspace(1)* %out) { +; SSE1-LABEL: canonicalize_undef: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; SSE1-NEXT: movl $0, (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canonicalize_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; AVX1-NEXT: movq %rax, (%rdi) +; AVX1-NEXT: retq + + %canonicalized = call double @llvm.canonicalize.f64(double undef) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { +; SSE1-LABEL: canon_fp32_varargsv4f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fstps 12(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstps 8(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstps 4(%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl $4 +; +; SSE2-LABEL: canon_fp32_varargsv4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canon_fp32_varargsv4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canon_fp32_varargsv4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq + %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a) + ret <4 x float> %canonicalized +} + +define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { +; SSE1-LABEL: canon_fp64_varargsv4f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fstpl 24(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstpl 16(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpl 8(%eax) +; SSE1-NEXT: fstpl (%eax) +; 
SSE1-NEXT: retl $4 +; +; SSE2-LABEL: canon_fp64_varargsv4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] +; SSE2-NEXT: mulpd %xmm2, %xmm0 +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canon_fp64_varargsv4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canon_fp64_varargsv4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq + %canonicalized = call <4 x double> @llvm.canonicalize.v4f32(<4 x double> %a) + ret <4 x double> %canonicalized +} + +define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) { +; SSE1-LABEL: canon_fp80_varargsv2fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmul %st, %st(1) +; SSE1-NEXT: fmulp %st, %st(2) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp80_varargsv2fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmul %st, %st(1) +; SSE2-NEXT: fmulp %st, %st(2) +; SSE2-NEXT: fxch %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp80_varargsv2fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmul %st, %st(1) +; AVX1-NEXT: fmulp %st, %st(2) +; AVX1-NEXT: fxch %st(1) +; AVX1-NEXT: retq + %canonicalized = call <2 x x86_fp80> @llvm.canonicalize.v2f80(<2 x x86_fp80> %a) + ret <2 x x86_fp80> %canonicalized +} + +define void @vec_canonicalize_var_v4f32(<4 x float> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_var_v4f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmuls (%eax) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmuls 4(%eax) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmuls 8(%eax) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmuls 12(%eax) +; SSE1-NEXT: fstps 12(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstps 8(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstps 4(%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_var_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX2-LABEL: vec_canonicalize_var_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulps (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec_canonicalize_var_v4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulps (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, (%rdi) +; AVX512F-NEXT: retq + %val = load <4 x float>, <4 x float> addrspace(1)* %out + %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %val) + store <4 x float> %canonicalized, <4 x float> addrspace(1)* %out + ret void +} + +define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_var_v4f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmull (%eax) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmull 8(%eax) +; SSE1-NEXT: fld 
%st(2) +; SSE1-NEXT: fmull 16(%eax) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmull 24(%eax) +; SSE1-NEXT: fstpl 24(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstpl 16(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpl 8(%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_var_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] +; SSE2-NEXT: movapd 16(%rdi), %xmm1 +; SSE2-NEXT: mulpd %xmm0, %xmm1 +; SSE2-NEXT: mulpd (%rdi), %xmm0 +; SSE2-NEXT: movapd %xmm0, (%rdi) +; SSE2-NEXT: movapd %xmm1, 16(%rdi) +; SSE2-NEXT: retq +; +; AVX2-LABEL: vec_canonicalize_var_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec_canonicalize_var_v4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulpd (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovapd %ymm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq + %val = load <4 x double>, <4 x double> addrspace(1)* %out + %canonicalized = call <4 x double> @llvm.canonicalize.v4f32(<4 x double> %val) + store <4 x double> %canonicalized, <4 x double> addrspace(1)* %out + ret void +} + +define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_x86_fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fldt 30(%eax) +; SSE1-NEXT: fldt 20(%eax) +; SSE1-NEXT: fldt 10(%eax) +; SSE1-NEXT: fldt (%eax) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmul %st, %st(1) +; SSE1-NEXT: fmul %st, %st(2) +; SSE1-NEXT: fmul %st, %st(3) +; SSE1-NEXT: fmulp %st, %st(4) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fstpt 30(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpt 20(%eax) +; SSE1-NEXT: fstpt 10(%eax) +; SSE1-NEXT: fstpt (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_x86_fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt 30(%rdi) +; SSE2-NEXT: fldt 20(%rdi) +; SSE2-NEXT: fldt 10(%rdi) +; SSE2-NEXT: fldt (%rdi) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmul %st, %st(1) +; SSE2-NEXT: fmul %st, %st(2) +; SSE2-NEXT: fmul %st, %st(3) +; SSE2-NEXT: fmulp %st, %st(4) +; SSE2-NEXT: fxch %st(3) +; SSE2-NEXT: fstpt 30(%rdi) +; SSE2-NEXT: fxch %st(1) +; SSE2-NEXT: fstpt 20(%rdi) +; SSE2-NEXT: fstpt 10(%rdi) +; SSE2-NEXT: fstpt (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec_canonicalize_x86_fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt 30(%rdi) +; AVX1-NEXT: fldt 20(%rdi) +; AVX1-NEXT: fldt 10(%rdi) +; AVX1-NEXT: fldt (%rdi) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmul %st, %st(1) +; AVX1-NEXT: fmul %st, %st(2) +; AVX1-NEXT: fmul %st, %st(3) +; AVX1-NEXT: fmulp %st, %st(4) +; AVX1-NEXT: fxch %st(3) +; AVX1-NEXT: fstpt 30(%rdi) +; AVX1-NEXT: fxch %st(1) +; AVX1-NEXT: fstpt 20(%rdi) +; AVX1-NEXT: fstpt 10(%rdi) +; AVX1-NEXT: fstpt (%rdi) +; AVX1-NEXT: retq + %val = load <4 x x86_fp80>, <4 x x86_fp80> addrspace(1)* %out + %canonicalized = call <4 x x86_fp80> @llvm.canonicalize.f80(<4 x x86_fp80> %val) + store <4 x x86_fp80> %canonicalized, <4 x x86_fp80> addrspace(1)* %out + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s index 1374417ac354b3..5a9f382d334ee4 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s @@ -215,6 +215,9 @@ buffer_store_d16_hi_format_x v1, off, s[12:15], -1 offset:4095 buffer_store_lds_dword 
s[4:7], -1 offset:4095 lds // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +buffer_wbinvl1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + buffer_wbinvl1_vol // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index 1e8d7684e942a6..c9756a068890e7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -34,6 +34,9 @@ buffer_invl2 buffer_store_lds_dword s[4:7], -1 offset:4095 lds // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +buffer_wbinvl1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + buffer_wbinvl1_vol // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_unsupported.s b/llvm/test/MC/AMDGPU/gfx12_unsupported.s index f0debbf052bcc1..c34cb9e29c1994 100644 --- a/llvm/test/MC/AMDGPU/gfx12_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx12_unsupported.s @@ -232,9 +232,15 @@ buffer_gl0_inv buffer_gl1_inv // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +buffer_store_lds_dword s[4:7], -1 offset:4095 lds +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + buffer_wbinvl1 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +buffer_wbinvl1_vol +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + flat_atomic_csub v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid instruction diff --git a/llvm/test/MC/AMDGPU/gfx940_unsupported.s b/llvm/test/MC/AMDGPU/gfx940_unsupported.s new file mode 100644 index 00000000000000..4ef53c7d952394 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx940_unsupported.s @@ -0,0 +1,11 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=CHECK --implicit-check-not=error: %s + +buffer_store_lds_dword s[4:7], -1 offset:4095 lds +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +buffer_wbinvl1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +buffer_wbinvl1_vol +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + diff --git a/llvm/test/MC/RISCV/machine-csr-names-invalid.s b/llvm/test/MC/RISCV/machine-csr-names-invalid.s deleted file mode 100644 index 526c41773484ca..00000000000000 --- a/llvm/test/MC/RISCV/machine-csr-names-invalid.s +++ /dev/null @@ -1,40 +0,0 @@ -# RUN: not llvm-mc -triple riscv64 < %s 2>&1 \ -# RUN: | FileCheck -check-prefixes=CHECK-NEED-RV32 %s - -# These machine mode CSR register names are RV32 only. 
- -csrrs t1, pmpcfg1, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'pmpcfg1' is RV32 only -csrrs t1, pmpcfg3, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'pmpcfg3' is RV32 only - -csrrs t1, mcycleh, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mcycleh' is RV32 only -csrrs t1, minstreth, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'minstreth' is RV32 only - -csrrs t1, mhpmcounter3h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter3h' is RV32 only -csrrs t1, mhpmcounter4h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter4h' is RV32 only -csrrs t1, mhpmcounter5h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter5h' is RV32 only -csrrs t1, mhpmcounter6h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter6h' is RV32 only -csrrs t1, mhpmcounter7h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter7h' is RV32 only -csrrs t1, mhpmcounter8h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter8h' is RV32 only -csrrs t1, mhpmcounter9h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter9h' is RV32 only -csrrs t1, mhpmcounter10h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter10h' is RV32 only -csrrs t1, mhpmcounter11h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter11h' is RV32 only -csrrs t1, mhpmcounter12h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter12h' is RV32 only -csrrs t1, mhpmcounter13h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter13h' is RV32 only -csrrs t1, mhpmcounter14h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter14h' is RV32 only -csrrs t1, mhpmcounter15h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter15h' is RV32 only -csrrs t1, mhpmcounter16h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter16h' is RV32 only -csrrs t1, mhpmcounter17h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter17h' is RV32 only -csrrs t1, mhpmcounter18h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter18h' is RV32 only -csrrs t1, mhpmcounter19h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter19h' is RV32 only -csrrs t1, mhpmcounter20h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter20h' is RV32 only -csrrs t1, mhpmcounter21h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter21h' is RV32 only -csrrs t1, mhpmcounter22h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter22h' is RV32 only -csrrs t1, mhpmcounter23h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter23h' is RV32 only -csrrs t1, mhpmcounter24h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter24h' is RV32 only -csrrs t1, mhpmcounter25h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter25h' is RV32 only -csrrs t1, mhpmcounter26h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter26h' is RV32 only -csrrs t1, mhpmcounter27h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter27h' is RV32 only -csrrs t1, mhpmcounter28h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter28h' is RV32 only -csrrs t1, mhpmcounter29h, zero # 
CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter29h' is RV32 only -csrrs t1, mhpmcounter30h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter30h' is RV32 only -csrrs t1, mhpmcounter31h, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mhpmcounter31h' is RV32 only diff --git a/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll b/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll new file mode 100644 index 00000000000000..94c912876d7b94 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +;; Tests that when we replace a call to strcmp with compiler-generated inline +;; code, we pass the strcmp call's dbg location to the inline code. +; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s + +@.str = constant [3 x i8] c"-h\00" + +define i32 @main() { +; CHECK-LABEL: define i32 @main() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[SUB_0:.*]], !dbg [[DBG4:![0-9]+]] +; CHECK: [[SUB_0]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1, !dbg [[DBG4]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32, !dbg [[DBG4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 45, !dbg [[DBG4]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0, !dbg [[DBG4]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[NE:.*]], label %[[SUB_1:.*]], !dbg [[DBG4]] +; CHECK: [[SUB_1]]: +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr null, i64 1), align 1, !dbg [[DBG4]] +; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32, !dbg [[DBG4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], 104, !dbg [[DBG4]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0, !dbg [[DBG4]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[NE]], label %[[SUB_2:.*]], !dbg [[DBG4]] +; CHECK: [[SUB_2]]: +; CHECK-NEXT: br label %[[NE]], !dbg [[DBG4]] +; CHECK: [[NE]]: +; CHECK-NEXT: br label %[[ENTRY_TAIL:.*]], !dbg [[DBG4]] +; CHECK: [[ENTRY_TAIL]]: +; CHECK-NEXT: ret i32 0 +; +entry: + %call.i = call i32 @strcmp(ptr null, ptr @.str), !dbg !4 + %cmp.i.not = icmp eq i32 %call.i, 0 + ret i32 0 +} + +declare i32 @strcmp(ptr, ptr) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git", isOptimized: true, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2) +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 258, column: 10, scope: !5) +!5 = distinct !DISubprogram(name: "streq", scope: !1, file: !1, line: 257, type: !7, scopeLine: 257, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], retainedTypes: [[META2]], globals: [[META2]]) +; CHECK: [[META1]] = !DIFile(filename: "test.c", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[DBG4]] = !DILocation(line: 258, column: 10, scope: [[META5:![0-9]+]]) +; CHECK: [[META5]] = distinct !DISubprogram(name: "streq", scope: [[META1]], file: [[META1]], line: 257, type: [[META6:![0-9]+]], scopeLine: 257, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META6]] = !DISubroutineType(types: [[META2]]) +;. diff --git a/llvm/test/Transforms/InstCombine/compare-3way.ll b/llvm/test/Transforms/InstCombine/compare-3way.ll index e2067368fb4c3e..5d443cd45238c7 100644 --- a/llvm/test/Transforms/InstCombine/compare-3way.ll +++ b/llvm/test/Transforms/InstCombine/compare-3way.ll @@ -15,8 +15,7 @@ define void @test_low_sgt(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp ne i64 [[A]], [[B]] -; CHECK-NEXT: [[RESULT:%.*]] = zext i1 [[EQ]] to i32 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -62,10 +61,7 @@ define void @test_low_sge(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp eq i64 [[A]], [[B]] -; CHECK-NEXT: [[SLT:%.*]] = icmp slt i64 [[A]], [[B]] -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[SLT]], i32 -1, i32 1 -; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[EQ]], i32 0, i32 [[DOT]] +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -114,8 +110,7 @@ define void @test_low_ne(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp ne i64 [[A]], [[B]] -; CHECK-NEXT: [[RESULT:%.*]] = zext i1 [[EQ]] to i32 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -212,8 +207,7 @@ define void @test_mid_sge(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp ne i64 [[A]], [[B]] -; CHECK-NEXT: [[RESULT:%.*]] = zext i1 [[EQ]] to i32 +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -238,10 +232,7 @@ define void @test_mid_sle(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp eq i64 [[A]], [[B]] -; CHECK-NEXT: [[SLT:%.*]] = icmp slt i64 [[A]], [[B]] -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[SLT]], i32 -1, i32 1 -; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[EQ]], i32 0, i32 [[DOT]] +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -266,9 +257,8 @@ define void @test_mid_ne(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[SLT:%.*]] = icmp slt i64 [[A]], [[B]] -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[SLT]], i32 -1, i32 1 -; CHECK-NEXT: call void @use(i32 [[DOT]]) +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) +; 
CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; %eq = icmp eq i64 %a, %b @@ -338,10 +328,7 @@ define void @test_high_slt(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp eq i64 [[A]], [[B]] -; CHECK-NEXT: [[SLT:%.*]] = icmp slt i64 [[A]], [[B]] -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[SLT]], i32 -1, i32 1 -; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[EQ]], i32 0, i32 [[DOT]] +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -389,10 +376,7 @@ define void @test_high_sle(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp eq i64 [[A]], [[B]] -; CHECK-NEXT: [[SLT:%.*]] = icmp slt i64 [[A]], [[B]] -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[SLT]], i32 -1, i32 1 -; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[EQ]], i32 0, i32 [[DOT]] +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; @@ -417,10 +401,7 @@ define void @test_high_ne(i64 %a, i64 %b) { ; CHECK: normal: ; CHECK-NEXT: ret void ; CHECK: unreached: -; CHECK-NEXT: [[EQ:%.*]] = icmp eq i64 [[A]], [[B]] -; CHECK-NEXT: [[SLT:%.*]] = icmp slt i64 [[A]], [[B]] -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[SLT]], i32 -1, i32 1 -; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[EQ]], i32 0, i32 [[DOT]] +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[A]], i64 [[B]]) ; CHECK-NEXT: call void @use(i32 [[RESULT]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/phi-with-multiple-unsimplifiable-values.ll b/llvm/test/Transforms/InstCombine/phi-with-multiple-unsimplifiable-values.ll index 2b75d5c5475117..cd40aa92ed4fd1 100644 --- a/llvm/test/Transforms/InstCombine/phi-with-multiple-unsimplifiable-values.ll +++ b/llvm/test/Transforms/InstCombine/phi-with-multiple-unsimplifiable-values.ll @@ -133,3 +133,35 @@ exit: %r = icmp slt i8 %phi, 0 ret i1 %r } + +; Same as the first transformation, but the phi node uses the result of scmp twice. 
This verifies that we don't clone values more than once per block +define i1 @icmp_of_phi_of_scmp_with_constant_one_user_two_uses(i8 %c, i16 %x, i16 %y, i8 %false_val) { +; CHECK-LABEL: define i1 @icmp_of_phi_of_scmp_with_constant_one_user_two_uses( +; CHECK-SAME: i8 [[C:%.*]], i16 [[X:%.*]], i16 [[Y:%.*]], i8 [[FALSE_VAL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i16 [[X]], [[Y]] +; CHECK-NEXT: switch i8 [[C]], label %[[BB_2:.*]] [ +; CHECK-NEXT: i8 0, label %[[BB:.*]] +; CHECK-NEXT: i8 1, label %[[BB]] +; CHECK-NEXT: ] +; CHECK: [[BB_2]]: +; CHECK-NEXT: br label %[[BB]] +; CHECK: [[BB]]: +; CHECK-NEXT: [[R:%.*]] = phi i1 [ [[TMP0]], %[[ENTRY]] ], [ [[TMP0]], %[[ENTRY]] ], [ false, %[[BB_2]] ] +; CHECK-NEXT: ret i1 [[R]] +; +entry: + %cmp = call i8 @llvm.scmp(i16 %x, i16 %y) + switch i8 %c, label %bb_2 [ + i8 0, label %bb + i8 1, label %bb + ] + +bb_2: + br label %bb + +bb: + %phi = phi i8 [ %cmp, %entry ], [ %cmp, %entry ], [ 0, %bb_2 ] + %r = icmp slt i8 %phi, 0 + ret i1 %r +} diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll index 123bc647462337..2140a59de3fa90 100644 --- a/llvm/test/Transforms/InstCombine/scmp.ll +++ b/llvm/test/Transforms/InstCombine/scmp.ll @@ -343,3 +343,133 @@ define i8 @scmp_from_select_gt_and_lt(i32 %x, i32 %y) { %r = select i1 %gt, i8 1, i8 %lt ret i8 %r } + +; (x == y) ? 0 : (x s> y ? 1 : -1) into scmp(x, y) +define i8 @scmp_from_select_eq_and_gt(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp sgt i32 %x, %y + %sel1 = select i1 %gt, i8 1, i8 -1 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} + +define i8 @scmp_from_select_eq_and_gt_inverse(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_inverse( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %ne = icmp ne i32 %x, %y + %gt = icmp sgt i32 %x, %y + %sel1 = select i1 %gt, i8 1, i8 -1 + %r = select i1 %ne, i8 %sel1, i8 0 + ret i8 %r +} + +define <4 x i8> @scmp_from_select_eq_and_gt_vec(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i8> @scmp_from_select_eq_and_gt_vec( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call <4 x i8> @llvm.scmp.v4i8.v4i32(<4 x i32> [[X]], <4 x i32> [[Y]]) +; CHECK-NEXT: ret <4 x i8> [[R]] +; + %eq = icmp eq <4 x i32> %x, %y + %gt = icmp sgt <4 x i32> %x, %y + %sel1 = select <4 x i1> %gt, <4 x i8> splat(i8 1), <4 x i8> splat(i8 -1) + %r = select <4 x i1> %eq, <4 x i8> splat(i8 0), <4 x i8> %sel1 + ret <4 x i8> %r +} + +define i8 @scmp_from_select_eq_and_gt_commuted1(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_commuted1( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[Y]], i32 [[X]]) +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp slt i32 %x, %y + %sel1 = select i1 %gt, i8 1, i8 -1 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} + +define i8 @scmp_from_select_eq_and_gt_commuted2(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_commuted2( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[Y]], i32 [[X]]) +; CHECK-NEXT: ret i8 
[[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp sgt i32 %x, %y + %sel1 = select i1 %gt, i8 -1, i8 1 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} + +define i8 @scmp_from_select_eq_and_gt_commuted3(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_commuted3( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp slt i32 %x, %y + %sel1 = select i1 %gt, i8 -1, i8 1 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} + +; Negative test: true value of outer select is not zero +define i8 @scmp_from_select_eq_and_gt_neg1(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg1( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[EQ:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[GT]], i8 1, i8 -1 +; CHECK-NEXT: [[R:%.*]] = select i1 [[EQ]], i8 5, i8 [[SEL1]] +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp sgt i32 %x, %y + %sel1 = select i1 %gt, i8 1, i8 -1 + %r = select i1 %eq, i8 5, i8 %sel1 + ret i8 %r +} + +; Negative test: true value of inner select is not 1 or -1 +define i8 @scmp_from_select_eq_and_gt_neg2(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg2( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[EQ:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[GT]], i8 2, i8 -1 +; CHECK-NEXT: [[R:%.*]] = select i1 [[EQ]], i8 0, i8 [[SEL1]] +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp sgt i32 %x, %y + %sel1 = select i1 %gt, i8 2, i8 -1 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} + +; Negative test: false value of inner select is not 1 or -1 +define i8 @scmp_from_select_eq_and_gt_neg3(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg3( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[EQ:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[GT]], i8 1, i8 22 +; CHECK-NEXT: [[R:%.*]] = select i1 [[EQ]], i8 0, i8 [[SEL1]] +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp sgt i32 %x, %y + %sel1 = select i1 %gt, i8 1, i8 22 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} diff --git a/llvm/test/Transforms/InstCombine/select-select.ll b/llvm/test/Transforms/InstCombine/select-select.ll index 1feae5ab504dcf..94e88c2f6cbe6c 100644 --- a/llvm/test/Transforms/InstCombine/select-select.ll +++ b/llvm/test/Transforms/InstCombine/select-select.ll @@ -177,10 +177,7 @@ define <2 x i8> @sel_shuf_narrowing_commute2(<4 x i8> %x, <4 x i8> %y, <2 x i8> define i8 @strong_order_cmp_slt_eq(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_slt_eq( -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select i1 [[CMP_LT]], i8 -1, i8 1 -; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_EQ:%.*]] = select i1 [[CMP_EQ]], i8 0, i8 [[SEL_LT]] +; CHECK-NEXT: [[SEL_EQ:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_EQ]] ; %cmp.lt = icmp slt i32 %a, %b @@ -192,10 +189,7 @@ define i8 @strong_order_cmp_slt_eq(i32 %a, i32 %b) { define i8 @strong_order_cmp_ult_eq(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_ult_eq( -; CHECK-NEXT: 
[[CMP_LT:%.*]] = icmp ult i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select i1 [[CMP_LT]], i8 -1, i8 1 -; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_EQ:%.*]] = select i1 [[CMP_EQ]], i8 0, i8 [[SEL_LT]] +; CHECK-NEXT: [[SEL_EQ:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_EQ]] ; %cmp.lt = icmp ult i32 %a, %b @@ -252,10 +246,7 @@ define i8 @strong_order_cmp_slt_ult_wrong_pred(i32 %a, i32 %b) { define i8 @strong_order_cmp_sgt_eq(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_sgt_eq( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEL_GT:%.*]] = select i1 [[CMP_GT]], i8 1, i8 -1 -; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_EQ:%.*]] = select i1 [[CMP_EQ]], i8 0, i8 [[SEL_GT]] +; CHECK-NEXT: [[SEL_EQ:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_EQ]] ; %cmp.gt = icmp sgt i32 %a, %b @@ -267,10 +258,7 @@ define i8 @strong_order_cmp_sgt_eq(i32 %a, i32 %b) { define i8 @strong_order_cmp_ugt_eq(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_ugt_eq( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp ugt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEL_GT:%.*]] = select i1 [[CMP_GT]], i8 1, i8 -1 -; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_EQ:%.*]] = select i1 [[CMP_EQ]], i8 0, i8 [[SEL_GT]] +; CHECK-NEXT: [[SEL_EQ:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_EQ]] ; %cmp.gt = icmp ugt i32 %a, %b @@ -395,9 +383,7 @@ define i8 @strong_order_cmp_slt_eq_slt_not_oneuse(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_slt_eq_slt_not_oneuse( ; CHECK-NEXT: [[CMP_LT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: call void @use1(i1 [[CMP_LT]]) -; CHECK-NEXT: [[SEL_LT:%.*]] = select i1 [[CMP_LT]], i8 -1, i8 1 -; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_EQ:%.*]] = select i1 [[CMP_EQ]], i8 0, i8 [[SEL_LT]] +; CHECK-NEXT: [[SEL_EQ:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: ret i8 [[SEL_EQ]] ; %cmp.lt = icmp slt i32 %a, %b @@ -410,11 +396,9 @@ define i8 @strong_order_cmp_slt_eq_slt_not_oneuse(i32 %a, i32 %b) { define i8 @strong_order_cmp_sgt_eq_eq_not_oneuse(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_sgt_eq_eq_not_oneuse( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEL_GT:%.*]] = select i1 [[CMP_GT]], i8 1, i8 -1 -; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A]], [[B]] +; CHECK-NEXT: [[CMP_EQ:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: call void @use1(i1 [[CMP_EQ]]) -; CHECK-NEXT: [[SEL_EQ:%.*]] = select i1 [[CMP_EQ]], i8 0, i8 [[SEL_GT]] +; CHECK-NEXT: [[SEL_EQ:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: ret i8 [[SEL_EQ]] ; %cmp.gt = icmp sgt i32 %a, %b diff --git a/llvm/test/Transforms/InstCombine/sink_to_unreachable.ll b/llvm/test/Transforms/InstCombine/sink_to_unreachable.ll index 01510f8e9596b5..72aa6dc80df34c 100644 --- a/llvm/test/Transforms/InstCombine/sink_to_unreachable.ll +++ b/llvm/test/Transforms/InstCombine/sink_to_unreachable.ll @@ -10,8 +10,7 @@ define void @test_01(i32 %x, i32 %y) { ; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[C2]], label [[EXIT:%.*]], label [[UNREACHED:%.*]] ; CHECK: unreached: -; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[X]], [[Y]] -; CHECK-NEXT: [[COMPARATOR:%.*]] = zext i1 [[C1]] to i32 +; CHECK-NEXT: [[COMPARATOR:%.*]] = 
call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 [[Y]]) ; CHECK-NEXT: call void @use(i32 [[COMPARATOR]]) ; CHECK-NEXT: unreachable ; CHECK: exit: @@ -42,8 +41,7 @@ define void @test_02(i32 %x, i32 %y) { ; CHECK-NEXT: [[C3:%.*]] = icmp sgt i32 [[X]], [[Y]] ; CHECK-NEXT: br i1 [[C3]], label [[EXIT]], label [[UNREACHED:%.*]] ; CHECK: unreached: -; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[X]], [[Y]] -; CHECK-NEXT: [[COMPARATOR:%.*]] = zext i1 [[C1]] to i32 +; CHECK-NEXT: [[COMPARATOR:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 [[Y]]) ; CHECK-NEXT: call void @use(i32 [[COMPARATOR]]) ; CHECK-NEXT: unreachable ; CHECK: exit: @@ -77,8 +75,7 @@ define i32 @test_03(i32 %x, i32 %y) { ; CHECK-NEXT: [[C3:%.*]] = icmp sgt i32 [[X]], [[Y]] ; CHECK-NEXT: br i1 [[C3]], label [[EXIT]], label [[UNREACHED:%.*]] ; CHECK: unreached: -; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[X]], [[Y]] -; CHECK-NEXT: [[COMPARATOR:%.*]] = zext i1 [[C1]] to i32 +; CHECK-NEXT: [[COMPARATOR:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 [[Y]]) ; CHECK-NEXT: ret i32 [[COMPARATOR]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/InstCombine/ucmp.ll b/llvm/test/Transforms/InstCombine/ucmp.ll index 13755f13bb0a11..2d5036019740cd 100644 --- a/llvm/test/Transforms/InstCombine/ucmp.ll +++ b/llvm/test/Transforms/InstCombine/ucmp.ll @@ -541,3 +541,17 @@ define i8 @ucmp_from_select_gt_and_lt(i32 %x, i32 %y) { %r = select i1 %gt, i8 1, i8 %lt ret i8 %r } + +; (x == y) ? 0 : (x u> y ? 1 : -1) into ucmp(x, y) +define i8 @scmp_from_select_eq_and_gt(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %eq = icmp eq i32 %x, %y + %gt = icmp ugt i32 %x, %y + %sel1 = select i1 %gt, i8 1, i8 -1 + %r = select i1 %eq, i8 0, i8 %sel1 + ret i8 %r +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll index 498f2059ffb0c4..7797c0bce0baa4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll @@ -54,9 +54,9 @@ for.end: ; preds = %for.body, %entry define void @vec_ptr(i64 %N, ptr noalias %a, ptr readnone %b) { ; CHECK-LABEL: @vec_ptr -; CHECK: vector.body: -; CHECK: %[[LOAD:.*]] = load , ptr -; CHECK: call @bar_vec( %[[LOAD]]) +; CHECK: for.body: +; CHECK: %[[LOAD:.*]] = load ptr, ptr +; CHECK: call i64 @bar(ptr %[[LOAD]]) entry: %cmp7 = icmp sgt i64 %N, 0 br i1 %cmp7, label %for.body, label %for.end diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll new file mode 100644 index 00000000000000..92b043a9c29d50 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -vectorizer-maximize-bandwidth -force-vector-interleave=1 -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +target triple = "aarch64-unknown-linux-gnu" + +;; Make sure we reject scalable vectors for fp128 types. 
We were previously +;; crashing before reaching the cost model when checking for the number of +;; registers required for a when trying to maximize +;; vector bandwidth with SVE. + +; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128 + +define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) { +; CHECK-LABEL: define void @load_ext_trunc_store( +; CHECK-SAME: ptr readonly [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x double> [[WIDE_LOAD]] to <4 x fp128> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <4 x fp128> [[TMP3]] to <4 x float> +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_IN:%.*]] = getelementptr inbounds nuw double, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_IN:%.*]] = load double, ptr [[GEP_IN]], align 8 +; CHECK-NEXT: [[LOAD_EXT:%.*]] = fpext double [[LOAD_IN]] to fp128 +; CHECK-NEXT: [[GEP_OUT:%.*]] = getelementptr inbounds nuw float, ptr [[OUT]], i64 [[IV]] +; CHECK-NEXT: [[TRUNC_OUT:%.*]] = fptrunc fp128 [[LOAD_EXT]] to float +; CHECK-NEXT: store float [[TRUNC_OUT]], ptr [[GEP_OUT]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.in = getelementptr inbounds nuw double, ptr %in, i64 %iv + %load.in = load double, ptr %gep.in, align 8 + %load.ext = fpext double %load.in to fp128 + %gep.out = getelementptr inbounds nuw float, ptr %out, i64 %iv 
+ %trunc.out = fptrunc fp128 %load.ext to float + store float %trunc.out, ptr %gep.out, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ult i64 %iv.next, %N + br i1 %exitcond, label %for.body, label %for.exit + +for.exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll index d9cc630482fc80..41ccb3c404dd76 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll @@ -2902,35 +2902,36 @@ define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ret void } +; FIXME: Re-enable modf[f] vectorization once aliasing issues due to output +; pointers have been resolved. + declare double @modf(double, ptr) declare float @modff(float, ptr) define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; SLEEF-NEON-LABEL: define void @modf_f64 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vl8_modf(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; SLEEF-NEON: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; SLEEF-SVE-LABEL: define void @modf_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; SLEEF-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; SLEEF-SVE-NOPRED-LABEL: define void @modf_f64 ; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE-NOPRED: [[TMP17:%.*]] = call @_ZGVsNxvl8_modf( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) -; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR64:[0-9]+]] +; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; ARMPL-NEON-LABEL: define void @modf_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vmodfq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; ARMPL-NEON: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; ARMPL-SVE-LABEL: define void @modf_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[TMP23:%.*]] = call @armpl_svmodf_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; ARMPL-SVE-NOPRED-LABEL: define void @modf_f64 ; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE-NOPRED: [[TMP17:%.*]] = call @armpl_svmodf_f64_x( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; ARMPL-SVE-NOPRED: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR64:[0-9]+]] +; ARMPL-SVE-NOPRED: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) ; entry: br label %for.body @@ -2954,29 +2955,27 @@ for.cond.cleanup: define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; SLEEF-NEON-LABEL: define void @modf_f32 ; SLEEF-NEON-SAME: (ptr noalias 
[[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vl4_modff(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; SLEEF-NEON: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; SLEEF-SVE-LABEL: define void @modf_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; SLEEF-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; SLEEF-SVE-NOPRED-LABEL: define void @modf_f32 ; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE-NOPRED: [[TMP17:%.*]] = call @_ZGVsNxvl4_modff( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) -; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR65:[0-9]+]] +; SLEEF-SVE-NOPRED: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; ARMPL-NEON-LABEL: define void @modf_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vmodfq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; ARMPL-NEON: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; ARMPL-SVE-LABEL: define void @modf_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[TMP23:%.*]] = call @armpl_svmodf_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) ; ; ARMPL-SVE-NOPRED-LABEL: define void @modf_f32 ; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE-NOPRED: [[TMP17:%.*]] = call @armpl_svmodf_f32_x( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; ARMPL-SVE-NOPRED: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) #[[ATTR65:[0-9]+]] +; ARMPL-SVE-NOPRED: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) ; entry: br label %for.body @@ -3276,35 +3275,36 @@ define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ret void } +; FIXME: Re-enable sincos[f] vectorization once aliasing issues with output +; pointers have been resolved. 
+ declare void @sincos(double, ptr, ptr) declare void @sincosf(float, ptr, ptr) define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; SLEEF-NEON-LABEL: define void @sincos_f64 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-NEON: call void @_ZGVnN2vl8l8_sincos(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; SLEEF-NEON: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-LABEL: define void @sincos_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR6:[0-9]+]] +; SLEEF-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-NOPRED-LABEL: define void @sincos_f64 ; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl8l8_sincos( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) -; SLEEF-SVE-NOPRED: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR72:[0-9]+]] +; SLEEF-SVE-NOPRED: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-NEON-LABEL: define void @sincos_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: call void @armpl_vsincosq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; ARMPL-NEON: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-LABEL: define void @sincos_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincos_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-NOPRED-LABEL: define void @sincos_f64 ; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE-NOPRED: call void @armpl_svsincos_f64_x( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; ARMPL-SVE-NOPRED: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR72:[0-9]+]] +; ARMPL-SVE-NOPRED: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; entry: br label %for.body @@ -3327,29 +3327,27 @@ for.cond.cleanup: define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; SLEEF-NEON-LABEL: define void @sincos_f32 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-NEON: call void @_ZGVnN4vl4l4_sincosf(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; SLEEF-NEON: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-LABEL: define void @sincos_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR7:[0-9]+]] +; SLEEF-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-NOPRED-LABEL: define void @sincos_f32 ; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias 
[[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl4l4_sincosf( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) -; SLEEF-SVE-NOPRED: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR73:[0-9]+]] +; SLEEF-SVE-NOPRED: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-NEON-LABEL: define void @sincos_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: call void @armpl_vsincosq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; ARMPL-NEON: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-LABEL: define void @sincos_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincos_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-NOPRED-LABEL: define void @sincos_f32 ; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE-NOPRED: call void @armpl_svsincos_f32_x( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; ARMPL-SVE-NOPRED: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR73:[0-9]+]] +; ARMPL-SVE-NOPRED: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; entry: br label %for.body @@ -3369,35 +3367,36 @@ for.cond.cleanup: ret void } +; FIXME: Re-enable sincospi[f] vectorization once aliasing issues with output +; pointers have been resolved. 
+ declare void @sincospi(double, ptr, ptr) declare void @sincospif(float, ptr, ptr) define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; SLEEF-NEON-LABEL: define void @sincospi_f64 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-NEON: call void @_ZGVnN2vl8l8_sincospi(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; SLEEF-NEON: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-LABEL: define void @sincospi_f64 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR8:[0-9]+]] +; SLEEF-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-NOPRED-LABEL: define void @sincospi_f64 ; SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl8l8_sincospi( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) -; SLEEF-SVE-NOPRED: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR74:[0-9]+]] +; SLEEF-SVE-NOPRED: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-NEON-LABEL: define void @sincospi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: call void @armpl_vsincospiq_f64(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; ARMPL-NEON: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-LABEL: define void @sincospi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincospi_f64_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-NOPRED-LABEL: define void @sincospi_f64 ; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE-NOPRED: call void @armpl_svsincospi_f64_x( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; ARMPL-SVE-NOPRED: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR74:[0-9]+]] +; ARMPL-SVE-NOPRED: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; entry: br label %for.body @@ -3420,29 +3419,27 @@ for.cond.cleanup: define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; SLEEF-NEON-LABEL: define void @sincospi_f32 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-NEON: call void @_ZGVnN4vl4l4_sincospif(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; SLEEF-NEON: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-LABEL: define void @sincospi_f32 ; SLEEF-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR9:[0-9]+]] +; SLEEF-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; SLEEF-SVE-NOPRED-LABEL: define void @sincospi_f32 ; 
SLEEF-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; SLEEF-SVE-NOPRED: call void @_ZGVsNxvl4l4_sincospif( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]]) -; SLEEF-SVE-NOPRED: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR75:[0-9]+]] +; SLEEF-SVE-NOPRED: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-NEON-LABEL: define void @sincospi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: call void @armpl_vsincospiq_f32(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]]) +; ARMPL-NEON: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-LABEL: define void @sincospi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: call void @armpl_svsincospi_f32_x( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP23:%.*]], ptr [[TMP24:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ARMPL-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; ; ARMPL-SVE-NOPRED-LABEL: define void @sincospi_f32 ; ARMPL-SVE-NOPRED-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { -; ARMPL-SVE-NOPRED: call void @armpl_svsincospi_f32_x( [[WIDE_LOAD:%.*]], ptr [[TMP17:%.*]], ptr [[TMP18:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; ARMPL-SVE-NOPRED: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) #[[ATTR75:[0-9]+]] +; ARMPL-SVE-NOPRED: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll index f60ab5e848dd3a..29904a7822131b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll @@ -13,48 +13,27 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; NEON-LABEL: define void @test_linear8 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1:%.*]], i32 0 -; NEON: [[TMP3:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP2]]) ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] ; ; NEON_INTERLEAVE-LABEL: define void @test_linear8 ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_OR_NEON: [[TMP14:%.*]] = extractelement [[TMP13:%.*]], i32 0 -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_foo_linear8_nomask_sve(ptr [[TMP14]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; ; 
SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8 ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_TF: [[TMP20:%.*]] = extractelement [[TMP19:%.*]], i32 0 -; SVE_TF: [[TMP21:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP20]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF: [[TMP25:%.*]] = extractelement [[TMP24:%.*]], i32 0 -; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8 ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; entry: br label %for.body @@ -76,35 +55,27 @@ for.cond.cleanup: define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) { ; NEON-LABEL: define void @test_vector_linear4 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0 -; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP4]]) ; NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; NEON_INTERLEAVE-LABEL: define void @test_vector_linear4 ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { -; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP7:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_vector_linear4 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], 
i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP16:%.*]] = extractelement [[TMP15:%.*]], i32 0 -; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_baz_vector_linear4_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_vector_linear4 ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_vector_linear4 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_vector_linear4 ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; entry: br label %for.body @@ -132,9 +103,7 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) { ; ; NEON_INTERLEAVE-LABEL: define void @test_linear8_bad_stride ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON_INTERLEAVE: [[TMP4:%.*]] = call i64 @foo(ptr [[TMP2:%.*]]) #[[ATTR2:[0-9]+]] -; NEON_INTERLEAVE: [[TMP5:%.*]] = call i64 @foo(ptr [[TMP3:%.*]]) #[[ATTR2]] -; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] +; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8_bad_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { @@ -142,15 +111,15 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) { ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8_bad_stride ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8_bad_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8_bad_stride ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; entry: br label %for.body @@ -172,35 +141,27 @@ for.cond.cleanup: define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) { ; NEON-LABEL: define void @test_linear16_wide_stride ; 
NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 -; NEON: [[TMP4:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP3]]) ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] ; ; NEON_INTERLEAVE-LABEL: define void @test_linear16_wide_stride ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) -; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP5:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP9:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP8]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] ; ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP15:%.*]] = extractelement [[TMP14:%.*]], i32 0 -; SVE_OR_NEON: [[TMP16:%.*]] = call @vec_foo_linear16_nomask_sve(ptr [[TMP15]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4]] ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear16_wide_stride ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3]] ; ; SVE_TF-LABEL: define void @test_linear16_wide_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] +; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linear16_wide_stride ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3]] ; entry: br label %for.body @@ -223,57 +184,27 @@ for.cond.cleanup: define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) { ; NEON-LABEL: define void @test_linear4_linear8 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP3:%.*]] = extractelement <4 x ptr> [[TMP1:%.*]], i32 0 -; NEON: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 -; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP3]], ptr [[TMP4]]) ; NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; NEON_INTERLEAVE-LABEL: define void @test_linear4_linear8 ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { -; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) -; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP5:%.*]], i32 0 -; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> 
@vec_quux_linear4_linear8_nomask_neon(ptr [[TMP9]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP15:%.*]] = extractelement [[TMP13:%.*]], i32 0 -; SVE_OR_NEON: [[TMP16:%.*]] = extractelement [[TMP14:%.*]], i32 0 -; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15]], ptr [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear4_linear8 ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP34:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear4_linear8 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[TMP21:%.*]] = extractelement [[TMP19:%.*]], i32 0 -; SVE_TF: [[TMP22:%.*]] = extractelement [[TMP20:%.*]], i32 0 -; SVE_TF: [[TMP23:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21]], ptr [[TMP22]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF: [[TMP27:%.*]] = extractelement [[TMP26:%.*]], i32 0 -; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linear4_linear8 ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP34:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) 
#[[ATTR4:[0-9]+]] ; entry: br label %for.body @@ -310,21 +241,21 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) { ; ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP14:%.*]] = extractelement [[TMP13:%.*]], i32 0 -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP14]]) +; SVE_OR_NEON: [[TMP13:%.*]] = extractelement [[TMP12:%.*]], i32 0 +; SVE_OR_NEON: [[TMP14:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP13]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear3_non_ptr ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear3_non_ptr ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linear3_non_ptr ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR5:[0-9]+]] ; entry: br label %for.body @@ -361,21 +292,21 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) { ; ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP14:%.*]] = extractelement [[TMP13:%.*]], i32 0 -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP14]]) +; SVE_OR_NEON: [[TMP13:%.*]] = extractelement [[TMP12:%.*]], i32 0 +; SVE_OR_NEON: [[TMP14:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP13]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] +; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR6:[0-9]+]] ; entry: br label %for.body @@ -398,48 +329,27 @@ for.cond.cleanup: define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) { ; NEON-LABEL: define void @test_linear8_return_void ; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], 
i32 0 -; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4]]) ; NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] ; ; NEON_INTERLEAVE-LABEL: define void @test_linear8_return_void ; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { -; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0 -; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP7:%.*]], i32 0 -; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP9]]) ; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8_return_void ; SVE_OR_NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP16:%.*]] = extractelement [[TMP15:%.*]], i32 0 -; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16]]) ; SVE_OR_NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR8:[0-9]+]] ; ; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8_return_void ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 -; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] +; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8_return_void ; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[TMP22:%.*]] = extractelement [[TMP21:%.*]], i32 0 -; SVE_TF: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF: [[TMP24:%.*]] = extractelement [[TMP23:%.*]], i32 0 -; SVE_TF: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] +; SVE_TF: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8_return_void ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 -; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 -; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 -; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] +; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR7:[0-9]+]] ; entry: br label %for.body @@ -458,6 +368,9 @@ for.cond.cleanup: ret void } +; Note: Vectorizing 
pointer arguments is currently disabled as LAA cannot detect +; aliasing from output/input pointers. + declare i64 @foo(ptr) declare i32 @baz(i32, ptr) declare i32 @quux(ptr, ptr) diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll index 936c07b4853a38..49454ae18db79d 100644 --- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll @@ -7,10 +7,9 @@ declare void @init_mem(ptr, i64); define i64 @same_exit_block_pre_inc_use1() { ; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' -; DEBUG: LV: Found an early exit. Retrying with speculative exit count. -; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. ; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 @@ -1089,8 +1088,7 @@ loop.end: define i64 @loop_contains_safe_call() { ; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_call' -; DEBUG: LV: Found an early exit. Retrying with speculative exit count. -; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; DEBUG-NEXT: LV: We can vectorize this loop! ; CHECK-LABEL: define i64 @loop_contains_safe_call() { ; CHECK-NEXT: entry: @@ -1193,8 +1191,7 @@ loop.end: define i64 @loop_contains_safe_div() { ; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_div' -; DEBUG: LV: Found an early exit. Retrying with speculative exit count. -; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; DEBUG-NEXT: LV: We can vectorize this loop! ; CHECK-LABEL: define i64 @loop_contains_safe_div() { ; CHECK-NEXT: entry: @@ -1347,10 +1344,9 @@ loop.end: define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { ; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' -; DEBUG: LV: Found an early exit. Retrying with speculative exit count. -; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. ; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( ; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { ; CHECK-NEXT: entry: @@ -1623,10 +1619,9 @@ loop.end: ; The form of the induction variables requires SCEV predicates. define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' -; DEBUG: LV: Found an early exit. Retrying with speculative exit count. 
-; DEBUG-NEXT: LV: Found speculative backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) +; DEBUG: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) ; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. ; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( ; CHECK-SAME: i32 [[END:%.*]]) { ; CHECK-NEXT: entry: @@ -1695,9 +1690,8 @@ declare void @abort() ; early is loop invariant. define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; DEBUG-LABEL: LV: Checking a loop in 'diff_blocks_invariant_early_exit_cond' -; DEBUG: LV: Found an early exit. Retrying with speculative exit count. -; DEBUG-NEXT: LV: Found speculative backedge taken count: 275 -; DEBUG: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 275 +; DEBUG: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. ; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( ; CHECK-SAME: ptr [[S:%.*]]) { ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll new file mode 100644 index 00000000000000..6b0b22b90510c1 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i16 @test() { +; CHECK-LABEL: define i16 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i16 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = or i16 [[OP_RDX]], 0 +; CHECK-NEXT: ret i16 [[OP_RDX1]] +; +entry: + %subi = add i16 0, 0 + %sub40.i = add i16 %subi, 0 + %sub41.i = add i16 %subi, 0 + %sub42.i = add i16 %subi, 0 + %sub43.i = add i16 %subi, 0 + %sub44.i = add i16 %subi, 0 + %sub45.i = add i16 %subi, 0 + %sub46.i = add i16 0, 0 + %sub47.i = add i16 0, 0 + %sub48.i = add i16 0, 0 + %sub49.i = add i16 0, 0 + %or40.i = or i16 %sub40.i, %sub41.i + %or41.i = or i16 %or40.i, %sub42.i + %or42.i = or i16 %or41.i, %sub43.i + %or43.i = or i16 %or42.i, %sub44.i + %or44.i = or i16 %or43.i, %sub45.i + %or45.i = or i16 %or44.i, %sub46.i + %or46.i = or i16 %or45.i, %sub47.i + %or47.i = or i16 %or46.i, %sub48.i + %or48.i = or i16 %or47.i, %sub49.i + %or50.i = or i16 %or48.i, %subi + %subii = add i16 0, 0 + %subi16.i = add i16 %subii, 0 + %subi17.i = add i16 %subii, 0 + %0 = or i16 %subi16.i, %subi17.i + %1 = or i16 %0, %or50.i + ret i16 %1 +} diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll index cbac4569ae6011..371b23019498d1 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll @@ -1,7 +1,6 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -slp-threshold=-1000 | FileCheck %s -; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -passes=slp-vectorizer -S -slp-threshold=-1000 | FileCheck %s +; RUN: %if aarch64-registered-target %{ opt < %s -mtriple=aarch64-unknown-linux-gnu -passes=slp-vectorizer -S -slp-threshold=-1000 | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -slp-threshold=-1000 | FileCheck %s %} define void @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll index baf94e44b3a52d..c250029519590f 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define <2 x i32> @test(i32 %arg) { ; CHECK-LABEL: define <2 x i32> @test( diff --git a/llvm/test/Transforms/SLPVectorizer/arith-div-undef.ll b/llvm/test/Transforms/SLPVectorizer/arith-div-undef.ll index dc6b0241f45472..3e45ace2417780 100644 --- a/llvm/test/Transforms/SLPVectorizer/arith-div-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/arith-div-undef.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S -slp-threshold=-10000 | FileCheck %s -; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -passes=slp-vectorizer,instcombine -S -slp-threshold=-10000 | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S -slp-threshold=-10000 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -mtriple=aarch64-unknown-linux-gnu -passes=slp-vectorizer,instcombine -S -slp-threshold=-10000 | FileCheck %s %} define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { ; CHECK-LABEL: @sdiv_v8i32_undefs( diff --git a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll index bad0a28a8a732b..a5b1e9b4575f05 100644 --- a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll +++ b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -; RUN: opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S 
--passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} + define i1 @test(i32 %0, i32 %1, i32 %p) { ; CHECK-LABEL: define i1 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/buildvector-insert-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/buildvector-insert-mask-size.ll index 9704fc2ca91d04..be7b00903743ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/buildvector-insert-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/buildvector-insert-mask-size.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s -slp-threshold=-1 | FileCheck %s -; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s -slp-threshold=-1 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s -slp-threshold=-1 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s -slp-threshold=-1 | FileCheck %s %} define void @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll b/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll index 5f63a31dc1f73a..36abe96567bb2d 100644 --- a/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll +++ b/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s -; RUN: opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define double @test() { ; CHECK-LABEL: define double @test() { diff --git a/llvm/test/Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll index 5cd87ab2a3750c..f0d5629cfc22e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-pc-windows-msvc19.34.0 < %s | FileCheck %s -; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-pc-windows-msvc19.34.0 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @test(ptr %0, i8 %1, i1 %cmp12.i) { ; CHECK-LABEL: define void @test( diff --git a/llvm/test/Transforms/SLPVectorizer/catchswitch.ll b/llvm/test/Transforms/SLPVectorizer/catchswitch.ll index f228d197a0eadf..2cd555fa9373b1 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/catchswitch.ll +++ b/llvm/test/Transforms/SLPVectorizer/catchswitch.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-pc-windows-msvc19.29.30145 < %s | FileCheck %s -; RUN: opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-pc-windows-msvc19.29.30145 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} ; This used to crash in SLP vectorization when attempting to set the ; IRBuilder's insertion point to the end of a catchswitch block, which diff --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll index 58a41848b3970a..793d089404d1e3 100644 --- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} define void @exceed(double %0, double %1) { ; CHECK-LABEL: @exceed( diff --git a/llvm/test/Transforms/SLPVectorizer/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/diamond_broadcast.ll index 3b8ced84bff896..6fe286fbbff61f 100644 --- a/llvm/test/Transforms/SLPVectorizer/diamond_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/diamond_broadcast.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-1 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-1 | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-1 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-1 | FileCheck %s %} define i32 @diamond_broadcast(ptr noalias nocapture %B, ptr noalias nocapture %A) { ; CHECK-LABEL: @diamond_broadcast( diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll index 6ea0cf290ffc82..03db1bba7d22a0 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll +++ 
b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll @@ -672,8 +672,8 @@ if.false: ret void } -define i32 @str_transcode0(i1 %cond1, ptr %p, i1 %cond2) { -; CHECK-LABEL: @str_transcode0( +define i32 @succ_phi_has_3input(i1 %cond1, ptr %p, i1 %cond2) { +; CHECK-LABEL: @succ_phi_has_3input( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[COND1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]] ; CHECK: bb1: @@ -728,6 +728,37 @@ if.true: ret i32 %res } +define i32 @succ1to0_phi3(ptr %p, ptr %p2, i32 %x) { +; CHECK-LABEL: @succ1to0_phi3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[X:%.*]] to <1 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]]) +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[SPEC_SELECT]], [[TMP4]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cond = icmp eq ptr %p, null + br i1 %cond, label %if.true, label %if.false + +if.false: + %0 = load i32, ptr %p + store i32 %0, ptr %p2 + br label %if.true + +if.true: + %res0 = phi i32 [ %0, %if.false ], [ 0, %entry ] + %res1 = phi i32 [ %0, %if.false ], [ %x, %entry ] + %res = add i32 %res0, %res1 + ret i32 %res +} + declare i32 @read_memory_only() readonly nounwind willreturn speculatable !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll index 845c5008e3837b..9549ccdbfe9ec4 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -2118,6 +2118,31 @@ cond.end: ; preds = %entry, %cond.false ret i8 %conv } +define i1 @linearmap_trunc_smaller_table_size(i8 %arg) { +; CHECK-LABEL: @linearmap_trunc_smaller_table_size( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[ARG:%.*]], 10 +; CHECK-NEXT: [[SWITCH_IDX_CAST:%.*]] = trunc i8 [[ARG]] to i1 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_IDX_CAST]], i1 false +; CHECK-NEXT: ret i1 [[SPEC_SELECT]] +; +entry: + switch i8 %arg, label %exit [ + i8 1, label %sw + i8 3, label %sw + i8 5, label %sw + i8 7, label %sw + i8 9, label %sw + ] + +sw: + br label %exit + +exit: + %phi = phi i1 [ true, %sw ], [ false, %entry ] + ret i1 %phi +} + ; Don't create a table with an unknown type define { i8, i8 } @test_unknown_result_type(i8 %n) { ; CHECK-LABEL: @test_unknown_result_type( diff --git a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll index 7ccc14cc0b125e..f3e5d273e88cca 100644 --- a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll +++ b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll @@ -48,10 +48,9 @@ entry: define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> 
@llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> -; CHECK-NEXT: ret <8 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0) +; CHECK-NEXT: ret <8 x i1> [[TMP3]] ; entry: %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 1e0dd0a7df34f1..5a03a85386e0aa 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -364,6 +364,14 @@ def version_int(ver): config.available_features.add("llvm-64-bits") config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") +if config.target_triple: + if re.match( + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + config.target_triple, + ): + config.available_features.add("target-byteorder-big-endian") + else: + config.available_features.add("target-byteorder-little-endian") if sys.platform in ["win32"]: # ExecutionEngine, no weak symbols in COFF. diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/if_target.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/if_target.ll new file mode 100644 index 00000000000000..63d9d5c90d4b44 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/if_target.ll @@ -0,0 +1,11 @@ +; Example input for update_test_checks (taken from test/Transforms/SLPVectorizer/extractlements-gathered-first-node.ll) +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} + +define void @test() { +bb: + %0 = extractelement <4 x i32> zeroinitializer, i32 0 + %1 = extractelement <2 x i32> zeroinitializer, i32 0 + %icmp = icmp ult i32 %0, %1 + ret void +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/if_target.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/if_target.ll.expected new file mode 100644 index 00000000000000..a744acd53f9822 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/if_target.ll.expected @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; Example input for update_test_checks (taken from test/Transforms/SLPVectorizer/extractlements-gathered-first-node.ll) +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> zeroinitializer, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> zeroinitializer, i32 0 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: ret void +; +bb: + %0 = extractelement <4 x i32> zeroinitializer, i32 0 + %1 = extractelement <2 x i32> zeroinitializer, i32 0 + %icmp = icmp ult i32 %0, %1 + ret void +} diff --git 
a/llvm/test/tools/UpdateTestChecks/update_test_checks/if_target.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/if_target.test new file mode 100644 index 00000000000000..3d8427b943c654 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/if_target.test @@ -0,0 +1,6 @@ +## Basic test checking that update_test_checks.py works correctly with %if in RUN line +# RUN: cp -f %S/Inputs/if_target.ll %t.ll && %update_test_checks %t.ll --version 4 +# RUN: diff -u %t.ll %S/Inputs/if_target.ll.expected +## Check that running the script again does not change the result: +# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/if_target.ll.expected \ No newline at end of file diff --git a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp index 1f4404aaa391fc..699fcf8a7dbcd6 100644 --- a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp +++ b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp @@ -98,45 +98,6 @@ static void parseArgs(int argc, char **argv) { exit(1); } -/* -cl::OptionCategory DebuginfodFindCategory("llvm-debuginfod-find Options"); - -cl::opt InputBuildID(cl::Positional, cl::Required, - cl::desc(""), cl::init("-"), - cl::cat(DebuginfodFindCategory)); - -static cl::opt - FetchExecutable("executable", cl::init(false), - cl::desc("If set, fetch a binary file associated with this " - "build id, containing the executable sections."), - cl::cat(DebuginfodFindCategory)); - -static cl::opt - FetchDebuginfo("debuginfo", cl::init(false), - cl::desc("If set, fetch a binary file associated with this " - "build id, containing the debuginfo sections."), - cl::cat(DebuginfodFindCategory)); - -static cl::opt FetchSource( - "source", cl::init(""), - cl::desc("Fetch a source file associated with this build id, which is at " - "this relative path relative to the compilation directory."), - cl::cat(DebuginfodFindCategory)); - -static cl::opt - DumpToStdout("dump", cl::init(false), - cl::desc("If set, dumps the contents of the fetched artifact " - "to standard output. Otherwise, dumps the absolute " - "path to the cached artifact on disk."), - cl::cat(DebuginfodFindCategory)); - -static cl::list DebugFileDirectory( - "debug-file-directory", - cl::desc("Path to directory where to look for debug files."), - cl::cat(DebuginfodFindCategory)); - -*/ - ExitOnError ExitOnDebuginfodFindError; static std::string fetchDebugInfo(object::BuildIDRef BuildID); diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 27ad7508756f10..9116b5ced02748 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -396,7 +396,12 @@ class SubProcessFunctionExecutorImpl // platforms have different definitions for some of the libc functions that // cause buildtime failures. Additionally, the subprocess executor mode (the // sole mode where this is supported) currently only supports x86_64. -#if defined(__x86_64__) + +// Also check that we have the SYS_getcpu macro defined, meaning the syscall +// actually exists within the build environment. We manually use the syscall +// rather than the libc wrapper given the wrapper for getcpu is only available +// in glibc 2.29 and later. +#if defined(__x86_64__) && defined(SYS_getcpu) // Set the CPU affinity for the child process, so that we ensure that if // the user specified a CPU the process should run on, the benchmarking // process is running on that CPU. 
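
The BenchmarkRunner.cpp hunks above replace the glibc getcpu() wrapper with a raw syscall and widen the preprocessor guard to require SYS_getcpu. The following standalone C++ sketch is not part of the patch; it is a minimal illustration, under the assumption of a Linux build environment that provides <sys/syscall.h>, of the same pattern: probe the current CPU through syscall(SYS_getcpu, ...) so the code still compiles against libcs that predate the glibc 2.29 getcpu() wrapper, and compile the block out entirely when SYS_getcpu is not defined.

// Illustrative sketch only; file name and main() are hypothetical, not from the patch.
#include <cstdio>
#include <sys/syscall.h>
#include <unistd.h>

int main() {
#if defined(__x86_64__) && defined(SYS_getcpu)
  unsigned CurrentCPU = 0;
  // Invoke the getcpu syscall directly instead of the libc wrapper.
  // The node and cache arguments are not needed here, so pass null.
  if (syscall(SYS_getcpu, &CurrentCPU, nullptr, nullptr) == 0)
    std::printf("running on CPU %u\n", CurrentCPU);
  else
    std::perror("SYS_getcpu");
#else
  std::printf("SYS_getcpu not available in this build environment\n");
#endif
  return 0;
}

Guarding on the macro rather than on a glibc version check keeps the probe purely compile-time and mirrors the updated assertion in the next hunk, which likewise calls syscall(SYS_getcpu, ...) instead of getcpu().
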
@@ -413,11 +418,11 @@ class SubProcessFunctionExecutorImpl // Check (if assertions are enabled) that we are actually running on the // CPU that was specified by the user. [[maybe_unused]] unsigned int CurrentCPU; - assert(getcpu(&CurrentCPU, nullptr) == 0 && + assert(syscall(SYS_getcpu, &CurrentCPU, nullptr) == 0 && "Expected getcpu call to succeed."); assert(static_cast(CurrentCPU) == CPUToUse && "Expected current CPU to equal the CPU requested by the user"); -#endif // defined(__x86_64__) +#endif // defined(__x86_64__) && defined(SYS_getcpu) exit(ChildProcessExitCodeE::SetCPUAffinityFailed); } diff --git a/llvm/tools/llvm-reduce/TestRunner.cpp b/llvm/tools/llvm-reduce/TestRunner.cpp index 8a61aae64b9029..aac5c4a4fe7a68 100644 --- a/llvm/tools/llvm-reduce/TestRunner.cpp +++ b/llvm/tools/llvm-reduce/TestRunner.cpp @@ -13,17 +13,18 @@ using namespace llvm; -TestRunner::TestRunner(StringRef TestName, - const std::vector &TestArgs, +TestRunner::TestRunner(StringRef TestName, ArrayRef RawTestArgs, std::unique_ptr Program, std::unique_ptr TM, StringRef ToolName, StringRef OutputName, bool InputIsBitcode, bool OutputBitcode) - : TestName(TestName), ToolName(ToolName), TestArgs(TestArgs), - Program(std::move(Program)), TM(std::move(TM)), - OutputFilename(OutputName), InputIsBitcode(InputIsBitcode), - EmitBitcode(OutputBitcode) { + : TestName(TestName), ToolName(ToolName), Program(std::move(Program)), + TM(std::move(TM)), OutputFilename(OutputName), + InputIsBitcode(InputIsBitcode), EmitBitcode(OutputBitcode) { assert(this->Program && "Initialized with null program?"); + + TestArgs.push_back(TestName); // argv[0] + TestArgs.append(RawTestArgs.begin(), RawTestArgs.end()); } static constexpr std::array, 3> DefaultRedirects = { @@ -33,18 +34,13 @@ static constexpr std::array, 3> NullRedirects; /// Runs the interestingness test, passes file to be tested as first argument /// and other specified test arguments after that. int TestRunner::run(StringRef Filename) const { - std::vector ProgramArgs; - ProgramArgs.push_back(TestName); - - for (const auto &Arg : TestArgs) - ProgramArgs.push_back(Arg); - - ProgramArgs.push_back(Filename); + SmallVector ExecArgs(TestArgs); + ExecArgs.push_back(Filename); std::string ErrMsg; int Result = - sys::ExecuteAndWait(TestName, ProgramArgs, /*Env=*/std::nullopt, + sys::ExecuteAndWait(TestName, ExecArgs, /*Env=*/std::nullopt, Verbose ? DefaultRedirects : NullRedirects, /*SecondsToWait=*/0, /*MemoryLimit=*/0, &ErrMsg); diff --git a/llvm/tools/llvm-reduce/TestRunner.h b/llvm/tools/llvm-reduce/TestRunner.h index 16d3dcd244a833..930c3248ff1056 100644 --- a/llvm/tools/llvm-reduce/TestRunner.h +++ b/llvm/tools/llvm-reduce/TestRunner.h @@ -25,7 +25,7 @@ namespace llvm { // respective filename. 
class TestRunner { public: - TestRunner(StringRef TestName, const std::vector &TestArgs, + TestRunner(StringRef TestName, ArrayRef TestArgs, std::unique_ptr Program, std::unique_ptr TM, StringRef ToolName, StringRef OutputFilename, bool InputIsBitcode, bool OutputBitcode); @@ -55,7 +55,7 @@ class TestRunner { private: StringRef TestName; StringRef ToolName; - const std::vector &TestArgs; + SmallVector TestArgs; std::unique_ptr Program; std::unique_ptr TM; StringRef OutputFilename; diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index 64e3b9c44cf8bb..d5239f21147cdb 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -1142,12 +1142,12 @@ TEST_F(IRBuilderTest, InsertExtractElement) { EXPECT_EQ(Elt2, X2); } -TEST_F(IRBuilderTest, CreateGlobalStringPtr) { +TEST_F(IRBuilderTest, CreateGlobalString) { IRBuilder<> Builder(BB); - auto String1a = Builder.CreateGlobalStringPtr("TestString", "String1a"); - auto String1b = Builder.CreateGlobalStringPtr("TestString", "String1b", 0); - auto String2 = Builder.CreateGlobalStringPtr("TestString", "String2", 1); + auto String1a = Builder.CreateGlobalString("TestString", "String1a"); + auto String1b = Builder.CreateGlobalString("TestString", "String1b", 0); + auto String2 = Builder.CreateGlobalString("TestString", "String2", 1); auto String3 = Builder.CreateGlobalString("TestString", "String3", 2); EXPECT_TRUE(String1a->getType()->getPointerAddressSpace() == 0); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index bd6a4c29ebbb62..ae143fd5b74ea1 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -14,6 +14,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" +#include "llvm/SandboxIR/Utils.h" #include "llvm/Support/SourceMgr.h" #include "gmock/gmock-matchers.h" #include "gtest/gtest.h" @@ -1179,6 +1180,24 @@ define ptr @foo() { EXPECT_EQ(PtrAuth->getWithSameSchema(&F), PtrAuth); } +TEST_F(SandboxIRTest, ConstantExpr) { + parseIR(C, R"IR( +define i32 @foo() { + ret i32 ptrtoint (ptr @foo to i32) +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *Ret = cast(&*It++); + // Check classof(), creation. + [[maybe_unused]] auto *ConstExpr = + cast(Ret->getReturnValue()); +} + TEST_F(SandboxIRTest, BlockAddress) { parseIR(C, R"IR( define void @foo(ptr %ptr) { @@ -1373,6 +1392,8 @@ OperandNo: 0 EXPECT_TRUE(I0->hasNUses(1u)); EXPECT_FALSE(I0->hasNUses(2u)); + // Check Value.getExpectedType + // Check User.setOperand(). Ret->setOperand(0, Arg0); EXPECT_EQ(Ret->getOperand(0), Arg0); @@ -1436,7 +1457,6 @@ define i32 @foo(i32 %arg0, i32 %arg1) { Replaced = Ret->replaceUsesOfWith(I0, Arg0); EXPECT_TRUE(Replaced); EXPECT_EQ(Ret->getOperand(0), Arg0); - // Check RAUW on constant. 
auto *Glob0 = cast(I1->getOperand(0)); auto *Glob1 = cast(I2->getOperand(0)); @@ -1445,6 +1465,68 @@ define i32 @foo(i32 %arg0, i32 %arg1) { EXPECT_EQ(Glob0->getOperand(0), Glob1); } +TEST_F(SandboxIRTest, GetExpected) { + parseIR(C, R"IR( +define float @foo(float %v, ptr %ptr) { + %add = fadd float %v, %v + store float %v, ptr %ptr + ret float %v +} +define void @bar(float %v, ptr %ptr) { + ret void +} +)IR"); + llvm::Function &Foo = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + Ctx.createFunction(&Foo); + auto *FooBB = cast(Ctx.getValue(&*Foo.begin())); + auto FooIt = FooBB->begin(); + auto Add = cast(&*FooIt++); + auto *S0 = cast(&*FooIt++); + auto *RetF = cast(&*FooIt++); + // getExpectedValue + EXPECT_EQ(sandboxir::Utils::getExpectedValue(Add), Add); + EXPECT_EQ(sandboxir::Utils::getExpectedValue(S0), + cast(S0)->getValueOperand()); + EXPECT_EQ(sandboxir::Utils::getExpectedValue(RetF), + cast(RetF)->getReturnValue()); + // getExpectedType + EXPECT_EQ(sandboxir::Utils::getExpectedType(Add), Add->getType()); + EXPECT_EQ(sandboxir::Utils::getExpectedType(S0), + cast(S0)->getValueOperand()->getType()); + EXPECT_EQ(sandboxir::Utils::getExpectedType(RetF), + cast(RetF)->getReturnValue()->getType()); + + // getExpectedValue for void returns + llvm::Function &Bar = *M->getFunction("bar"); + Ctx.createFunction(&Bar); + auto *BarBB = cast(Ctx.getValue(&*Bar.begin())); + auto BarIt = BarBB->begin(); + auto *RetV = cast(&*BarIt++); + EXPECT_EQ(sandboxir::Utils::getExpectedValue(RetV), nullptr); +} + +TEST_F(SandboxIRTest, GetNumBits) { + parseIR(C, R"IR( +define void @foo(float %arg0, double %arg1, i8 %arg2, i64 %arg3) { +bb0: + ret void +} +)IR"); + llvm::Function &Foo = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(&Foo); + const DataLayout &DL = M->getDataLayout(); + // getNumBits for scalars + EXPECT_EQ(sandboxir::Utils::getNumBits(F->getArg(0), DL), + DL.getTypeSizeInBits(Type::getFloatTy(C))); + EXPECT_EQ(sandboxir::Utils::getNumBits(F->getArg(1), DL), + DL.getTypeSizeInBits(Type::getDoubleTy(C))); + EXPECT_EQ(sandboxir::Utils::getNumBits(F->getArg(2), DL), 8u); + EXPECT_EQ(sandboxir::Utils::getNumBits(F->getArg(3), DL), 64u); +} + TEST_F(SandboxIRTest, RAUW_RUWIf) { parseIR(C, R"IR( define void @foo(ptr %ptr) { diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py index 16f3e618770b20..b413c253e39757 100755 --- a/llvm/utils/update_test_checks.py +++ b/llvm/utils/update_test_checks.py @@ -123,7 +123,13 @@ def main(): common.warn("Skipping unparsable RUN line: " + l) continue - commands = [cmd.strip() for cmd in l.split("|")] + cropped_content = l + if "%if" in l: + match = re.search(r"%{\s*(.*?)\s*%}", l) + if match: + cropped_content = match.group(1) + + commands = [cmd.strip() for cmd in cropped_content.split("|")] assert len(commands) >= 2 preprocess_cmd = None if len(commands) > 2: diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h index 6be5548fdb60ef..8ff4d4ec67b9fd 100644 --- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h +++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h @@ -17,11 +17,13 @@ namespace mlir { /// Rewriting that replace SourceOp with a CallOp to `f32Func` or `f64Func` or -/// `f32ApproxFunc` depending on the element type and the fastMathFlag of that -/// Op. The function declaration is added in case it was not added before. 
+/// `f32ApproxFunc` or `f16Func` depending on the element type and the +/// fastMathFlag of that Op. The function declaration is added in case it was +/// not added before. /// -/// If the input values are of f16 type, the value is first casted to f32, the -/// function called and then the result casted back. +/// If the input values are of bf16 type (or f16 type if f16Func is empty), the +/// value is first casted to f32, the function called and then the result casted +/// back. /// /// Example with NVVM: /// %exp_f32 = math.exp %arg_f32 : f32 @@ -41,9 +43,10 @@ template struct OpToFuncCallLowering : public ConvertOpToLLVMPattern { public: explicit OpToFuncCallLowering(LLVMTypeConverter &lowering, StringRef f32Func, - StringRef f64Func, StringRef f32ApproxFunc) + StringRef f64Func, StringRef f32ApproxFunc, + StringRef f16Func) : ConvertOpToLLVMPattern(lowering), f32Func(f32Func), - f64Func(f64Func), f32ApproxFunc(f32ApproxFunc) {} + f64Func(f64Func), f32ApproxFunc(f32ApproxFunc), f16Func(f16Func) {} LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, @@ -89,7 +92,11 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern { private: Value maybeCast(Value operand, PatternRewriter &rewriter) const { Type type = operand.getType(); - if (!isa(type)) + if (!isa(type)) + return operand; + + // if there's a f16 function, no need to cast f16 values + if (!f16Func.empty() && isa(type)) return operand; return rewriter.create( @@ -102,6 +109,8 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern { } StringRef getFunctionName(Type type, arith::FastMathFlags flag) const { + if (isa(type)) + return f16Func; if (isa(type)) { if (((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) && !f32ApproxFunc.empty()) @@ -130,6 +139,7 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern { const std::string f32Func; const std::string f64Func; const std::string f32ApproxFunc; + const std::string f16Func; }; } // namespace mlir diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 4be330b0bb26bb..2b91a6c28c05e8 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -335,11 +335,11 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) { template static void populateOpPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef f32Func, - StringRef f64Func, - StringRef f32ApproxFunc = "") { + StringRef f64Func, StringRef f32ApproxFunc = "", + StringRef f16Func = "") { patterns.add>(converter); patterns.add>(converter, f32Func, f64Func, - f32ApproxFunc); + f32ApproxFunc, f16Func); } void mlir::populateGpuSubgroupReduceOpLoweringPattern( diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index fc3e1fc4f9d0c9..482c9e2c2d0017 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -334,10 +334,9 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { target.addIllegalOp(); - // These ops are legal for f16 and f32 type. + // These ops are legal for f32 type. target.addDynamicallyLegalOp([](Operation *op) { - return any_of(op->getOperandTypes(), - llvm::IsaPred); + return any_of(op->getOperandTypes(), llvm::IsaPred); }); // TODO: Remove once we support replacing non-root ops. 
target.addLegalOp(); @@ -346,9 +345,11 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { template static void populateOpPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef f32Func, - StringRef f64Func) { + StringRef f64Func, StringRef f32ApproxFunc, + StringRef f16Func) { patterns.add>(converter); - patterns.add>(converter, f32Func, f64Func); + patterns.add>(converter, f32Func, f32ApproxFunc, + f16Func); } void mlir::populateGpuToROCDLConversionPatterns( diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp index b3b4d81e7ffa5b..8330713ea66e5c 100644 --- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp +++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp @@ -38,17 +38,17 @@ using namespace mlir; template static void populateOpPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef f32Func, - StringRef f64Func, + StringRef f64Func, StringRef f16Func, StringRef f32ApproxFunc = "") { patterns.add>(converter); patterns.add>(converter, f32Func, f64Func, - f32ApproxFunc); + f32ApproxFunc, f16Func); } void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { // Handled by mathToLLVM: math::AbsIOp - // Handled by mathToLLVM: math::AbsFIOp + // Handled by mathToLLVM: math::AbsFOp // Handled by mathToLLVM: math::CopySignOp // Handled by mathToLLVM: math::CountLeadingZerosOp // Handled by mathToLLVM: math::CountTrailingZerosOp @@ -63,59 +63,61 @@ void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, // Handled by mathToLLVM: math::SqrtOp // Handled by mathToLLVM: math::TruncOp populateOpPatterns(converter, patterns, "__ocml_acos_f32", - "__ocml_acos_f64"); + "__ocml_acos_f64", "__ocml_acos_f16"); populateOpPatterns(converter, patterns, "__ocml_acosh_f32", - "__ocml_acosh_f64"); + "__ocml_acosh_f64", "__ocml_acosh_f16"); populateOpPatterns(converter, patterns, "__ocml_asin_f32", - "__ocml_asin_f64"); + "__ocml_asin_f64", "__ocml_asin_f16"); populateOpPatterns(converter, patterns, "__ocml_asinh_f32", - "__ocml_asinh_f64"); + "__ocml_asinh_f64", "__ocml_asinh_f16"); populateOpPatterns(converter, patterns, "__ocml_atan_f32", - "__ocml_atan_f64"); + "__ocml_atan_f64", "__ocml_atan_f16"); populateOpPatterns(converter, patterns, "__ocml_atanh_f32", - "__ocml_atanh_f64"); + "__ocml_atanh_f64", "__ocml_atanh_f16"); populateOpPatterns(converter, patterns, "__ocml_atan2_f32", - "__ocml_atan2_f64"); + "__ocml_atan2_f64", "__ocml_atan2_f16"); populateOpPatterns(converter, patterns, "__ocml_cbrt_f32", - "__ocml_cbrt_f64"); + "__ocml_cbrt_f64", "__ocml_cbrt_f16"); populateOpPatterns(converter, patterns, "__ocml_ceil_f32", - "__ocml_ceil_f64"); + "__ocml_ceil_f64", "__ocml_ceil_f16"); populateOpPatterns(converter, patterns, "__ocml_cos_f32", - "__ocml_cos_f64"); + "__ocml_cos_f64", "__ocml_cos_f16"); populateOpPatterns(converter, patterns, "__ocml_cosh_f32", - "__ocml_cosh_f64"); + "__ocml_cosh_f64", "__ocml_cosh_f16"); populateOpPatterns(converter, patterns, "__ocml_sinh_f32", - "__ocml_sinh_f64"); - populateOpPatterns(converter, patterns, "", "__ocml_exp_f64"); + "__ocml_sinh_f64", "__ocml_sinh_f16"); + populateOpPatterns(converter, patterns, "", "__ocml_exp_f64", + "__ocml_exp_f16"); populateOpPatterns(converter, patterns, "__ocml_exp2_f32", - "__ocml_exp2_f64"); + "__ocml_exp2_f64", "__ocml_exp2_f16"); populateOpPatterns(converter, patterns, "__ocml_expm1_f32", - "__ocml_expm1_f64"); + 
"__ocml_expm1_f64", "__ocml_expm1_f16"); populateOpPatterns(converter, patterns, "__ocml_floor_f32", - "__ocml_floor_f64"); - populateOpPatterns(converter, patterns, "", "__ocml_log_f64"); + "__ocml_floor_f64", "__ocml_floor_f16"); + populateOpPatterns(converter, patterns, "", "__ocml_log_f64", + "__ocml_log_f16"); populateOpPatterns(converter, patterns, "__ocml_log10_f32", - "__ocml_log10_f64"); + "__ocml_log10_f64", "__ocml_log10_f16"); populateOpPatterns(converter, patterns, "__ocml_log1p_f32", - "__ocml_log1p_f64"); + "__ocml_log1p_f64", "__ocml_log1p_f16"); populateOpPatterns(converter, patterns, "__ocml_log2_f32", - "__ocml_log2_f64"); + "__ocml_log2_f64", "__ocml_log2_f16"); populateOpPatterns(converter, patterns, "__ocml_pow_f32", - "__ocml_pow_f64"); + "__ocml_pow_f64", "__ocml_pow_f16"); populateOpPatterns(converter, patterns, "__ocml_rsqrt_f32", - "__ocml_rsqrt_f64"); + "__ocml_rsqrt_f64", "__ocml_rsqrt_f16"); populateOpPatterns(converter, patterns, "__ocml_sin_f32", - "__ocml_sin_f64"); + "__ocml_sin_f64", "__ocml_sin_f16"); populateOpPatterns(converter, patterns, "__ocml_tanh_f32", - "__ocml_tanh_f64"); + "__ocml_tanh_f64", "__ocml_tanh_f16"); populateOpPatterns(converter, patterns, "__ocml_tan_f32", - "__ocml_tan_f64"); + "__ocml_tan_f64", "__ocml_tan_f16"); populateOpPatterns(converter, patterns, "__ocml_erf_f32", - "__ocml_erf_f64"); + "__ocml_erf_f64", "__ocml_erf_f16"); // Single arith pattern that needs a ROCDL call, probably not // worth creating a separate pass for it. populateOpPatterns(converter, patterns, "__ocml_fmod_f32", - "__ocml_fmod_f64"); + "__ocml_fmod_f64", "__ocml_fmod_f16"); } namespace { diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index eb065cbab86789..0d3e9f4ea2bf39 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -162,11 +162,12 @@ gpu.module @test_module { // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_exp_f16(f16) -> f16 // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 // CHECK-LABEL: func @gpu_exp func.func @gpu_exp(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { %result16 = math.exp %arg_f16 : f16 - // CHECK: llvm.intr.exp(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.call @__ocml_exp_f16(%{{.*}}) : (f16) -> f16 %result32 = math.exp %arg_f32 : f32 // CHECK: llvm.intr.exp(%{{.*}}) : (f32) -> f32 %result64 = math.exp %arg_f64 : f64 @@ -178,11 +179,12 @@ gpu.module @test_module { // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_log_f16(f16) -> f16 // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log func.func @gpu_log(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { %result16 = math.log %arg_f16 : f16 - // CHECK: llvm.intr.log(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.call @__ocml_log_f16(%{{.*}}) : (f16) -> f16 %result32 = math.log %arg_f32 : f32 // CHECK: llvm.intr.log(%{{.*}}) : (f32) -> f32 %result64 = math.log %arg_f64 : f64 @@ -194,108 +196,113 @@ gpu.module @test_module { // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_cbrt_f16(f16) -> f16 // CHECK: llvm.func @__ocml_cbrt_f32(f32) -> f32 // CHECK: llvm.func @__ocml_cbrt_f64(f64) -> f64 // CHECK-LABEL: func @gpu_cbrt - func.func @gpu_cbrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_cbrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.cbrt %arg_f16 : f16 + // CHECK: llvm.call 
@__ocml_cbrt_f16(%{{.*}}) : (f16) -> f16 %result32 = math.cbrt %arg_f32 : f32 // CHECK: llvm.call @__ocml_cbrt_f32(%{{.*}}) : (f32) -> f32 %result64 = math.cbrt %arg_f64 : f64 // CHECK: llvm.call @__ocml_cbrt_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_ceil_f16(f16) -> f16 // CHECK: llvm.func @__ocml_ceil_f32(f32) -> f32 // CHECK: llvm.func @__ocml_ceil_f64(f64) -> f64 // CHECK-LABEL: func @gpu_ceil - func.func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_ceil(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.ceil %arg_f16 : f16 + // CHECK: llvm.call @__ocml_ceil_f16(%{{.*}}) : (f16) -> f16 %result32 = math.ceil %arg_f32 : f32 // CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (f32) -> f32 %result64 = math.ceil %arg_f64 : f64 // CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_floor_f16(f16) -> f16 // CHECK: llvm.func @__ocml_floor_f32(f32) -> f32 // CHECK: llvm.func @__ocml_floor_f64(f64) -> f64 // CHECK-LABEL: func @gpu_floor - func.func @gpu_floor(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_floor(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.floor %arg_f16 : f16 + // CHECK: llvm.call @__ocml_floor_f16(%{{.*}}) : (f16) -> f16 %result32 = math.floor %arg_f32 : f32 // CHECK: llvm.call @__ocml_floor_f32(%{{.*}}) : (f32) -> f32 %result64 = math.floor %arg_f64 : f64 // CHECK: llvm.call @__ocml_floor_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_cos_f16(f16) -> f16 // CHECK: llvm.func @__ocml_cos_f32(f32) -> f32 // CHECK: llvm.func @__ocml_cos_f64(f64) -> f64 // CHECK-LABEL: func @gpu_cos - func.func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_cos(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.cos %arg_f16 : f16 + // CHECK: llvm.call @__ocml_cos_f16(%{{.*}}) : (f16) -> f16 %result32 = math.cos %arg_f32 : f32 // CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (f32) -> f32 %result64 = math.cos %arg_f64 : f64 // CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - -gpu.module @test_module { - // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_exp - func.func @gpu_exp(%arg_f64 : f64) -> (f64) { - %result64 = math.exp %arg_f64 : f64 - // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 - func.return %result64 : f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_exp2_f16(f16) -> f16 // CHECK: llvm.func @__ocml_exp2_f32(f32) -> f32 // CHECK: llvm.func @__ocml_exp2_f64(f64) -> f64 // CHECK-LABEL: func @gpu_exp2 - func.func @gpu_exp2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_exp2(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.exp2 %arg_f16 : f16 + // CHECK: llvm.call @__ocml_exp2_f16(%{{.*}}) : (f16) -> f16 %exp2_f32 = math.exp2 %arg_f32 : f32 // CHECK: llvm.call 
@__ocml_exp2_f32(%{{.*}}) : (f32) -> f32 %result32 = math.exp2 %exp2_f32 : f32 // CHECK: llvm.call @__ocml_exp2_f32(%{{.*}}) : (f32) -> f32 %result64 = math.exp2 %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp2_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- + // Test that we handled properly operation with SymbolTable other than module op gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope + // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16 // CHECK: llvm.func @__ocml_sin_f32(f32) -> f32 // CHECK: llvm.func @__ocml_sin_f64(f64) -> f64 // CHECK-LABEL: func @gpu_sin - func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %sin_f32 = math.sin %arg_f32 : f32 + func.func @gpu_sin(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + %result16 = math.sin %arg_f16 : f16 // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 - %result32 = math.sin %sin_f32 : f32 - // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.sin %arg_f64 : f64 + %result32 = math.sin %arg_f32 : f32 // CHECK: llvm.call @__ocml_sin_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + %result64 = math.sin %arg_f64 : f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } "test.finish" () : () -> () }) : () -> () @@ -304,89 +311,102 @@ gpu.module @test_module { // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_expm1_f16(f16) -> f16 // CHECK: llvm.func @__ocml_expm1_f32(f32) -> f32 // CHECK: llvm.func @__ocml_expm1_f64(f64) -> f64 // CHECK-LABEL: func @gpu_expm1 - func.func @gpu_expm1(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_expm1(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.expm1 %arg_f16 : f16 + // CHECK: llvm.call @__ocml_expm1_f16(%{{.*}}) : (f16) -> f16 %expm1_f32 = math.expm1 %arg_f32 : f32 // CHECK: llvm.call @__ocml_expm1_f32(%{{.*}}) : (f32) -> f32 %result32 = math.expm1 %expm1_f32 : f32 // CHECK: llvm.call @__ocml_expm1_f32(%{{.*}}) : (f32) -> f32 %result64 = math.expm1 %arg_f64 : f64 // CHECK: llvm.call @__ocml_expm1_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_log_f16(f16) -> f16 // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log - func.func @gpu_log(%arg_f64 : f64) -> (f64) { + func.func @gpu_log(%arg_f16 : f16, %arg_f64 : f64) -> (f16, f64) { + %result16 = math.log %arg_f16 : f16 + // CHECK: llvm.call @__ocml_log_f16(%{{.*}}) : (f16) -> f16 %result64 = math.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64 - func.return %result64 : f64 + func.return %result16, %result64 : f16, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_log1p_f16(f16) -> f16 // CHECK: llvm.func @__ocml_log1p_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log1p_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log1p - func.func @gpu_log1p(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_log1p(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.log1p %arg_f16 : f16 + // CHECK: llvm.call @__ocml_log1p_f16(%{{.*}}) : (f16) -> f16 %result32 = math.log1p %arg_f32 : f32 // CHECK: llvm.call 
@__ocml_log1p_f32(%{{.*}}) : (f32) -> f32 %result64 = math.log1p %arg_f64 : f64 // CHECK: llvm.call @__ocml_log1p_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_log10_f16(f16) -> f16 // CHECK: llvm.func @__ocml_log10_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log10_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log10 - func.func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_log10(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.log10 %arg_f16 : f16 + // CHECK: llvm.call @__ocml_log10_f16(%{{.*}}) : (f16) -> f16 %result32 = math.log10 %arg_f32 : f32 // CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (f32) -> f32 %result64 = math.log10 %arg_f64 : f64 // CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_log2_f16(f16) -> f16 // CHECK: llvm.func @__ocml_log2_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log2_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log2 - func.func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_log2(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.log2 %arg_f16 : f16 + // CHECK: llvm.call @__ocml_log2_f16(%{{.*}}) : (f16) -> f16 %result32 = math.log2 %arg_f32 : f32 // CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (f32) -> f32 %result64 = math.log2 %arg_f64 : f64 // CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_rsqrt_f16(f16) -> f16 // CHECK: llvm.func @__ocml_rsqrt_f32(f32) -> f32 // CHECK: llvm.func @__ocml_rsqrt_f64(f64) -> f64 // CHECK-LABEL: func @gpu_rsqrt - func.func @gpu_rsqrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) - -> (f16, f32, f64) { + func.func @gpu_rsqrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { %result16 = math.rsqrt %arg_f16 : f16 - // CHECK: llvm.fpext %{{.*}} : f16 to f32 - // CHECK-NEXT: llvm.call @__ocml_rsqrt_f32(%{{.*}}) : (f32) -> f32 - // CHECK-NEXT: llvm.fptrunc %{{.*}} : f32 to f16 + // CHECK: llvm.call @__ocml_rsqrt_f16(%{{.*}}) : (f16) -> f16 %result32 = math.rsqrt %arg_f32 : f32 // CHECK: llvm.call @__ocml_rsqrt_f32(%{{.*}}) : (f32) -> f32 %result64 = math.rsqrt %arg_f64 : f64 @@ -398,90 +418,108 @@ gpu.module @test_module { // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_tan_f16(f16) -> f16 // CHECK: llvm.func @__ocml_tan_f32(f32) -> f32 // CHECK: llvm.func @__ocml_tan_f64(f64) -> f64 // CHECK-LABEL: func @gpu_tan - func.func @gpu_tan(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_tan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.tan %arg_f16 : f16 + // CHECK: llvm.call @__ocml_tan_f16(%{{.*}}) : (f16) -> f16 %result32 = math.tan %arg_f32 : f32 // CHECK: llvm.call @__ocml_tan_f32(%{{.*}}) : (f32) -> f32 %result64 = math.tan %arg_f64 : f64 // CHECK: llvm.call @__ocml_tan_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_tanh_f16(f16) -> 
f16 // CHECK: llvm.func @__ocml_tanh_f32(f32) -> f32 // CHECK: llvm.func @__ocml_tanh_f64(f64) -> f64 // CHECK-LABEL: func @gpu_tanh - func.func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_tanh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.tanh %arg_f16 : f16 + // CHECK: llvm.call @__ocml_tanh_f16(%{{.*}}) : (f16) -> f16 %result32 = math.tanh %arg_f32 : f32 // CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (f32) -> f32 %result64 = math.tanh %arg_f64 : f64 // CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_atan_f16(f16) -> f16 // CHECK: llvm.func @__ocml_atan_f32(f32) -> f32 // CHECK: llvm.func @__ocml_atan_f64(f64) -> f64 // CHECK-LABEL: func @gpu_atan - func.func @gpu_atan(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_atan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.atan %arg_f16 : f16 + // CHECK: llvm.call @__ocml_atan_f16(%{{.*}}) : (f16) -> f16 %result32 = math.atan %arg_f32 : f32 // CHECK: llvm.call @__ocml_atan_f32(%{{.*}}) : (f32) -> f32 %result64 = math.atan %arg_f64 : f64 // CHECK: llvm.call @__ocml_atan_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_atan2_f16(f16, f16) -> f16 // CHECK: llvm.func @__ocml_atan2_f32(f32, f32) -> f32 // CHECK: llvm.func @__ocml_atan2_f64(f64, f64) -> f64 // CHECK-LABEL: func @gpu_atan2 - func.func @gpu_atan2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_atan2(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.atan2 %arg_f16, %arg_f16 : f16 + // CHECK: llvm.call @__ocml_atan2_f16(%{{.*}}) : (f16, f16) -> f16 %result32 = math.atan2 %arg_f32, %arg_f32 : f32 // CHECK: llvm.call @__ocml_atan2_f32(%{{.*}}) : (f32, f32) -> f32 %result64 = math.atan2 %arg_f64, %arg_f64 : f64 // CHECK: llvm.call @__ocml_atan2_f64(%{{.*}}) : (f64, f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_pow_f16(f16, f16) -> f16 // CHECK: llvm.func @__ocml_pow_f32(f32, f32) -> f32 // CHECK: llvm.func @__ocml_pow_f64(f64, f64) -> f64 // CHECK-LABEL: func @gpu_pow - func.func @gpu_pow(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_pow(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.powf %arg_f16, %arg_f16 : f16 + // CHECK: llvm.call @__ocml_pow_f16(%{{.*}}, %{{.*}}) : (f16, f16) -> f16 %result32 = math.powf %arg_f32, %arg_f32 : f32 // CHECK: llvm.call @__ocml_pow_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 %result64 = math.powf %arg_f64, %arg_f64 : f64 // CHECK: llvm.call @__ocml_pow_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_erf_f16(f16) -> f16 // CHECK: llvm.func @__ocml_erf_f32(f32) -> f32 // CHECK: llvm.func @__ocml_erf_f64(f64) -> f64 // CHECK-LABEL: func @gpu_erf - func.func @gpu_erf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_erf(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 
: f64) -> (f16, f32, f64) { + %result16 = math.erf %arg_f16 : f16 + // CHECK: llvm.call @__ocml_erf_f16(%{{.*}}) : (f16) -> f16 %result32 = math.erf %arg_f32 : f32 // CHECK: llvm.call @__ocml_erf_f32(%{{.*}}) : (f32) -> f32 %result64 = math.erf %arg_f64 : f64 // CHECK: llvm.call @__ocml_erf_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } @@ -543,9 +581,9 @@ gpu.module @test_module { // ----- gpu.module @module { -// CHECK-LABEL: @spirv_exp +// CHECK-LABEL: @spirv_sin // CHECK: llvm.call @__ocml_sin_f32 - spirv.func @spirv_exp(%arg0: vector<4xf32>) -> vector<4xf32> "None" { + spirv.func @spirv_sin(%arg0: vector<4xf32>) -> vector<4xf32> "None" { %0 = math.sin %arg0 : vector<4xf32> spirv.ReturnValue %0 : vector<4xf32> } @@ -602,15 +640,18 @@ gpu.module @test_module { // ----- gpu.module @test_module { + // CHECK: llvm.func @__ocml_fmod_f16(f16, f16) -> f16 // CHECK: llvm.func @__ocml_fmod_f32(f32, f32) -> f32 // CHECK: llvm.func @__ocml_fmod_f64(f64, f64) -> f64 // CHECK-LABEL: func @gpu_fmod - func.func @gpu_fmod(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @gpu_fmod(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = arith.remf %arg_f16, %arg_f16 : f16 + // CHECK: llvm.call @__ocml_fmod_f16(%{{.*}}, %{{.*}}) : (f16, f16) -> f16 %result32 = arith.remf %arg_f32, %arg_f32 : f32 // CHECK: llvm.call @__ocml_fmod_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 %result64 = arith.remf %arg_f64, %arg_f64 : f64 // CHECK: llvm.call @__ocml_fmod_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir index 19d89e03a7f483..ddd96bf797e6e7 100644 --- a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir +++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir @@ -1,399 +1,483 @@ // RUN: mlir-opt %s -convert-math-to-rocdl -split-input-file | FileCheck %s module @test_module { + // CHECK: llvm.func @__ocml_fmod_f16(f16, f16) -> f16 // CHECK: llvm.func @__ocml_fmod_f32(f32, f32) -> f32 // CHECK: llvm.func @__ocml_fmod_f64(f64, f64) -> f64 // CHECK-LABEL: func @arith_remf - func.func @arith_remf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @arith_remf(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = arith.remf %arg_f16, %arg_f16 : f16 + // CHECK: llvm.call @__ocml_fmod_f16(%{{.*}}, %{{.*}}) : (f16, f16) -> f16 %result32 = arith.remf %arg_f32, %arg_f32 : f32 // CHECK: llvm.call @__ocml_fmod_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 %result64 = arith.remf %arg_f64, %arg_f64 : f64 // CHECK: llvm.call @__ocml_fmod_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- module @test_module { + // CHECK: llvm.func @__ocml_acos_f16(f16) -> f16 // CHECK: llvm.func @__ocml_acos_f32(f32) -> f32 // CHECK: llvm.func @__ocml_acos_f64(f64) -> f64 // CHECK-LABEL: func @math_acos - func.func @math_acos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @math_acos(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.acos %arg_f16 : f16 + // CHECK: llvm.call @__ocml_acos_f16(%{{.*}}) : (f16) -> f16 %result32 = math.acos %arg_f32 : f32 // CHECK: llvm.call @__ocml_acos_f32(%{{.*}}) : (f32) 
-> f32 %result64 = math.acos %arg_f64 : f64 // CHECK: llvm.call @__ocml_acos_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- module @test_module { + // CHECK: llvm.func @__ocml_acosh_f16(f16) -> f16 // CHECK: llvm.func @__ocml_acosh_f32(f32) -> f32 // CHECK: llvm.func @__ocml_acosh_f64(f64) -> f64 // CHECK-LABEL: func @math_acosh - func.func @math_acosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @math_acosh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.acosh %arg_f16 : f16 + // CHECK: llvm.call @__ocml_acosh_f16(%{{.*}}) : (f16) -> f16 %result32 = math.acosh %arg_f32 : f32 // CHECK: llvm.call @__ocml_acosh_f32(%{{.*}}) : (f32) -> f32 %result64 = math.acosh %arg_f64 : f64 // CHECK: llvm.call @__ocml_acosh_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- module @test_module { + // CHECK: llvm.func @__ocml_asin_f16(f16) -> f16 // CHECK: llvm.func @__ocml_asin_f32(f32) -> f32 // CHECK: llvm.func @__ocml_asin_f64(f64) -> f64 // CHECK-LABEL: func @math_asin - func.func @math_asin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @math_asin(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.asin %arg_f16 : f16 + // CHECK: llvm.call @__ocml_asin_f16(%{{.*}}) : (f16) -> f16 %result32 = math.asin %arg_f32 : f32 // CHECK: llvm.call @__ocml_asin_f32(%{{.*}}) : (f32) -> f32 %result64 = math.asin %arg_f64 : f64 // CHECK: llvm.call @__ocml_asin_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- module @test_module { + // CHECK: llvm.func @__ocml_asinh_f16(f16) -> f16 // CHECK: llvm.func @__ocml_asinh_f32(f32) -> f32 // CHECK: llvm.func @__ocml_asinh_f64(f64) -> f64 // CHECK-LABEL: func @math_asinh - func.func @math_asinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @math_asinh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.asinh %arg_f16 : f16 + // CHECK: llvm.call @__ocml_asinh_f16(%{{.*}}) : (f16) -> f16 %result32 = math.asinh %arg_f32 : f32 // CHECK: llvm.call @__ocml_asinh_f32(%{{.*}}) : (f32) -> f32 %result64 = math.asinh %arg_f64 : f64 // CHECK: llvm.call @__ocml_asinh_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- module @test_module { + // CHECK: llvm.func @__ocml_atan_f16(f16) -> f16 // CHECK: llvm.func @__ocml_atan_f32(f32) -> f32 // CHECK: llvm.func @__ocml_atan_f64(f64) -> f64 // CHECK-LABEL: func @math_atan - func.func @math_atan(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + func.func @math_atan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) { + %result16 = math.atan %arg_f16 : f16 + // CHECK: llvm.call @__ocml_atan_f16(%{{.*}}) : (f16) -> f16 %result32 = math.atan %arg_f32 : f32 // CHECK: llvm.call @__ocml_atan_f32(%{{.*}}) : (f32) -> f32 %result64 = math.atan %arg_f64 : f64 // CHECK: llvm.call @__ocml_atan_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result16, %result32, %result64 : f16, f32, f64 } } // ----- module @test_module { + // CHECK: llvm.func @__ocml_atanh_f16(f16) -> f16 // CHECK: llvm.func @__ocml_atanh_f32(f32) -> f32 // CHECK: llvm.func 
@__ocml_atanh_f64(f64) -> f64
   // CHECK-LABEL: func @math_atanh
-  func.func @math_atanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_atanh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.atanh %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_atanh_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.atanh %arg_f32 : f32
     // CHECK: llvm.call @__ocml_atanh_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.atanh %arg_f64 : f64
     // CHECK: llvm.call @__ocml_atanh_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_atan2_f16(f16, f16) -> f16
   // CHECK: llvm.func @__ocml_atan2_f32(f32, f32) -> f32
   // CHECK: llvm.func @__ocml_atan2_f64(f64, f64) -> f64
   // CHECK-LABEL: func @math_atan2
-  func.func @math_atan2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_atan2(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.atan2 %arg_f16, %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_atan2_f16(%{{.*}}, %{{.*}}) : (f16, f16) -> f16
     %result32 = math.atan2 %arg_f32, %arg_f32 : f32
     // CHECK: llvm.call @__ocml_atan2_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
     %result64 = math.atan2 %arg_f64, %arg_f64 : f64
     // CHECK: llvm.call @__ocml_atan2_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_cbrt_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_cbrt_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_cbrt_f64(f64) -> f64
   // CHECK-LABEL: func @math_cbrt
-  func.func @math_cbrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_cbrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.cbrt %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_cbrt_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.cbrt %arg_f32 : f32
     // CHECK: llvm.call @__ocml_cbrt_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.cbrt %arg_f64 : f64
     // CHECK: llvm.call @__ocml_cbrt_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_ceil_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_ceil_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_ceil_f64(f64) -> f64
   // CHECK-LABEL: func @math_ceil
-  func.func @math_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_ceil(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.ceil %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_ceil_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.ceil %arg_f32 : f32
     // CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.ceil %arg_f64 : f64
     // CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_cos_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_cos_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_cos_f64(f64) -> f64
   // CHECK-LABEL: func @math_cos
-  func.func @math_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_cos(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.cos %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_cos_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.cos %arg_f32 : f32
     // CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.cos %arg_f64 : f64
     // CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_cosh_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_cosh_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_cosh_f64(f64) -> f64
   // CHECK-LABEL: func @math_cosh
-  func.func @math_cosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_cosh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.cosh %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_cosh_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.cosh %arg_f32 : f32
     // CHECK: llvm.call @__ocml_cosh_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.cosh %arg_f64 : f64
     // CHECK: llvm.call @__ocml_cosh_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_sinh_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_sinh_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_sinh_f64(f64) -> f64
   // CHECK-LABEL: func @math_sinh
-  func.func @math_sinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_sinh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.sinh %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_sinh_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.sinh %arg_f32 : f32
     // CHECK: llvm.call @__ocml_sinh_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.sinh %arg_f64 : f64
     // CHECK: llvm.call @__ocml_sinh_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_exp_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64
   // CHECK-LABEL: func @math_exp
-  func.func @math_exp(%arg_f64 : f64) -> (f64) {
+  func.func @math_exp(%arg_f16 : f16, %arg_f64 : f64) -> (f16, f64) {
+    %result16 = math.exp %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_exp_f16(%{{.*}}) : (f16) -> f16
     %result64 = math.exp %arg_f64 : f64
     // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64
-    func.return %result64 : f64
+    func.return %result16, %result64 : f16, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_exp2_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_exp2_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_exp2_f64(f64) -> f64
   // CHECK-LABEL: func @math_exp2
-  func.func @math_exp2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_exp2(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.exp2 %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_exp2_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.exp2 %arg_f32 : f32
     // CHECK: llvm.call @__ocml_exp2_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.exp2 %arg_f64 : f64
     // CHECK: llvm.call @__ocml_exp2_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_expm1_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_expm1_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_expm1_f64(f64) -> f64
   // CHECK-LABEL: func @math_expm1
-  func.func @math_expm1(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_expm1(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.expm1 %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_expm1_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.expm1 %arg_f32 : f32
     // CHECK: llvm.call @__ocml_expm1_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.expm1 %arg_f64 : f64
     // CHECK: llvm.call @__ocml_expm1_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_floor_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_floor_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_floor_f64(f64) -> f64
   // CHECK-LABEL: func @math_floor
-  func.func @math_floor(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_floor(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.floor %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_floor_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.floor %arg_f32 : f32
     // CHECK: llvm.call @__ocml_floor_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.floor %arg_f64 : f64
     // CHECK: llvm.call @__ocml_floor_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_log_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_log_f64(f64) -> f64
   // CHECK-LABEL: func @math_log
-  func.func @math_log(%arg_f64 : f64) -> (f64) {
+  func.func @math_log(%arg_f16 : f16, %arg_f64 : f64) -> (f16, f64) {
+    %result16 = math.log %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_log_f16(%{{.*}}) : (f16) -> f16
     %result64 = math.log %arg_f64 : f64
     // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64
-    func.return %result64 : f64
+    func.return %result16, %result64 : f16, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_log10_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_log10_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_log10_f64(f64) -> f64
   // CHECK-LABEL: func @math_log10
-  func.func @math_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_log10(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.log10 %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_log10_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.log10 %arg_f32 : f32
     // CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.log10 %arg_f64 : f64
     // CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_log1p_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_log1p_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_log1p_f64(f64) -> f64
   // CHECK-LABEL: func @math_log1p
-  func.func @math_log1p(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_log1p(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.log1p %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_log1p_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.log1p %arg_f32 : f32
     // CHECK: llvm.call @__ocml_log1p_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.log1p %arg_f64 : f64
     // CHECK: llvm.call @__ocml_log1p_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_pow_f16(f16, f16) -> f16
   // CHECK: llvm.func @__ocml_pow_f32(f32, f32) -> f32
   // CHECK: llvm.func @__ocml_pow_f64(f64, f64) -> f64
   // CHECK-LABEL: func @math_powf
-  func.func @math_powf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_powf(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.powf %arg_f16, %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_pow_f16(%{{.*}}, %{{.*}}) : (f16, f16) -> f16
     %result32 = math.powf %arg_f32, %arg_f32 : f32
     // CHECK: llvm.call @__ocml_pow_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
     %result64 = math.powf %arg_f64, %arg_f64 : f64
     // CHECK: llvm.call @__ocml_pow_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_rsqrt_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_rsqrt_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_rsqrt_f64(f64) -> f64
   // CHECK-LABEL: func @math_rsqrt
-  func.func @math_rsqrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_rsqrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.rsqrt %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_rsqrt_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.rsqrt %arg_f32 : f32
     // CHECK: llvm.call @__ocml_rsqrt_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.rsqrt %arg_f64 : f64
     // CHECK: llvm.call @__ocml_rsqrt_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_sin_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_sin_f64(f64) -> f64
   // CHECK-LABEL: func @math_sin
-  func.func @math_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_sin(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.sin %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.sin %arg_f32 : f32
     // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.sin %arg_f64 : f64
     // CHECK: llvm.call @__ocml_sin_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_tanh_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_tanh_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_tanh_f64(f64) -> f64
   // CHECK-LABEL: func @math_tanh
-  func.func @math_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_tanh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.tanh %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_tanh_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.tanh %arg_f32 : f32
     // CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.tanh %arg_f64 : f64
     // CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_tan_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_tan_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_tan_f64(f64) -> f64
   // CHECK-LABEL: func @math_tan
-  func.func @math_tan(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_tan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.tan %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_tan_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.tan %arg_f32 : f32
     // CHECK: llvm.call @__ocml_tan_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.tan %arg_f64 : f64
     // CHECK: llvm.call @__ocml_tan_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
+  // CHECK: llvm.func @__ocml_erf_f16(f16) -> f16
   // CHECK: llvm.func @__ocml_erf_f32(f32) -> f32
   // CHECK: llvm.func @__ocml_erf_f64(f64) -> f64
   // CHECK-LABEL: func @math_erf
-  func.func @math_erf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @math_erf(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+    %result16 = math.erf %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_erf_f16(%{{.*}}) : (f16) -> f16
     %result32 = math.erf %arg_f32 : f32
     // CHECK: llvm.call @__ocml_erf_f32(%{{.*}}) : (f32) -> f32
     %result64 = math.erf %arg_f64 : f64
     // CHECK: llvm.call @__ocml_erf_f64(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
   }
 }

 // -----

 module @test_module {
-  // CHECK: llvm.func @__ocml_fmod_f32(f32, f32) -> f32
-  // CHECK: llvm.func @__ocml_fmod_f64(f64, f64) -> f64
-  // CHECK-LABEL: func @arith_remf
-  func.func @arith_remf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
-    %result32 = arith.remf %arg_f32, %arg_f32 : f32
-    // CHECK: llvm.call @__ocml_fmod_f32(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
-    %result64 = arith.remf %arg_f64, %arg_f64 : f64
-    // CHECK: llvm.call @__ocml_fmod_f64(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
-    func.return %result32, %result64 : f32, f64
+  // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16
+  // CHECK: llvm.func @__ocml_sin_f32(f32) -> f32
+  // CHECK: llvm.func @__ocml_sin_f64(f64) -> f64
+  // CHECK-LABEL: func @math_casting
+  func.func @math_casting(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64, %arg_bf16 : bf16) -> (f16, f32, f64, bf16) {
+    %resultf16 = math.sin %arg_f16 : f16
+    // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16
+    %resultf32 = math.sin %arg_f32 : f32
+    // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32
+    %resultf64 = math.sin %arg_f64 : f64
+    // CHECK: llvm.call @__ocml_sin_f64(%{{.*}}) : (f64) -> f64
+    %resultbf16 = math.sin %arg_bf16 : bf16
+    // CHECK: llvm.fpext %{{.*}} : bf16 to f32
+    // CHECK-NEXT: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32
+    // CHECK-NEXT: llvm.fptrunc %{{.*}} : f32 to bf16
+    func.return %resultf16, %resultf32, %resultf64, %resultbf16 : f16, f32, f64, bf16
   }
 }
-
diff --git a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp
index 8a29b0aaaf9c36..5355fe2f376394 100644
--- a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp
+++ b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp
@@ -23,7 +23,7 @@ llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder,
   // because CPU backends typically ignore the address space. For constant
   // strings as returned by getPrintableString, the format string should instead
   // directly spell out the string.
-  return Builder.CreateGlobalStringPtr(Str, "", 4);
+  return Builder.CreateGlobalString(Str, "", 4);
 }

 Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
@@ -131,7 +131,7 @@ Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) {
 void RuntimeDebugBuilder::createPrintF(PollyIRBuilder &Builder,
                                        std::string Format,
                                        ArrayRef<Value *> Values) {
-  Value *FormatString = Builder.CreateGlobalStringPtr(Format);
+  Value *FormatString = Builder.CreateGlobalString(Format);
   std::vector<Value *> Arguments;

   Arguments.push_back(FormatString);
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 0e39211988f891..a269cf861a5b74 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -804,6 +804,19 @@ libc_support_library(
     ],
 )

+libc_support_library(
+    name = "__support_fputil_cast",
+    hdrs = ["src/__support/FPUtil/cast.h"],
+    deps = [
+        ":__support_cpp_algorithm",
+        ":__support_cpp_type_traits",
+        ":__support_fputil_dyadic_float",
+        ":__support_fputil_fp_bits",
+        ":__support_macros_properties_types",
+        ":hdr_fenv_macros",
+    ],
+)
+
 libc_support_library(
     name = "__support_fputil_division_and_remainder_operations",
     hdrs = ["src/__support/FPUtil/DivisionAndRemainderOperations.h"],
@@ -821,9 +834,12 @@ libc_support_library(
     hdrs = ["src/__support/FPUtil/except_value_utils.h"],
     deps = [
         ":__support_cpp_optional",
+        ":__support_fputil_cast",
         ":__support_fputil_fenv_impl",
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
+        ":__support_macros_properties_cpu_features",
+        ":__support_macros_properties_types",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
index adf4b235b1b5e3..ca21eaee504ff1 100644
--- a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
@@ -46,6 +46,7 @@ libc_support_library(
         "//libc:__support_cpp_string_view",
         "//libc:__support_cpp_stringstream",
         "//libc:__support_cpp_type_traits",
+        "//libc:__support_fputil_cast",
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_fpbits_str",
         "//libc:__support_macros_config",