diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp index eae0d84721c43..cfc8ff653f169 100644 --- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp +++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp @@ -268,8 +268,7 @@ int main(int argc, const char **argv) { Error = false; llvm::sys::Mutex IndexMutex; // ExecutorConcurrency is a flag exposed by AllTUsExecution.h - llvm::ThreadPool Pool(ExecutorConcurrency == 0 ? llvm::hardware_concurrency() - : ExecutorConcurrency); + llvm::ThreadPool Pool(llvm::hardware_concurrency(ExecutorConcurrency)); for (auto &Group : USRToBitcode) { Pool.async([&]() { std::vector> Infos; diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 52c1ceef74259..3fbf98970cceb 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -87,9 +87,8 @@ CompletionItemKind toCompletionItemKind(index::SymbolKind Kind) { return CompletionItemKind::Text; case SK::Enum: return CompletionItemKind::Enum; - // FIXME(ioeric): use LSP struct instead of class when it is suppoted in the - // protocol. case SK::Struct: + return CompletionItemKind::Struct; case SK::Class: case SK::Protocol: case SK::Extension: @@ -102,18 +101,16 @@ CompletionItemKind toCompletionItemKind(index::SymbolKind Kind) { case SK::Using: return CompletionItemKind::Reference; case SK::Function: - // FIXME(ioeric): this should probably be an operator. This should be fixed - // when `Operator` is support type in the protocol. case SK::ConversionFunction: return CompletionItemKind::Function; case SK::Variable: case SK::Parameter: + case SK::NonTypeTemplateParm: return CompletionItemKind::Variable; case SK::Field: return CompletionItemKind::Field; - // FIXME(ioeric): use LSP enum constant when it is supported in the protocol. 
case SK::EnumConstant: - return CompletionItemKind::Value; + return CompletionItemKind::EnumMember; case SK::InstanceMethod: case SK::ClassMethod: case SK::StaticMethod: @@ -125,6 +122,9 @@ CompletionItemKind toCompletionItemKind(index::SymbolKind Kind) { return CompletionItemKind::Property; case SK::Constructor: return CompletionItemKind::Constructor; + case SK::TemplateTypeParm: + case SK::TemplateTemplateParm: + return CompletionItemKind::TypeParameter; } llvm_unreachable("Unhandled clang::index::SymbolKind."); } diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index ae4c441a73b57..750df50c47777 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -115,15 +115,6 @@ std::string printDefinition(const Decl *D) { return Definition; } -void printParams(llvm::raw_ostream &OS, - const std::vector &Params) { - for (size_t I = 0, E = Params.size(); I != E; ++I) { - if (I) - OS << ", "; - OS << Params.at(I); - } -} - std::string printType(QualType QT, const PrintingPolicy &Policy) { // TypePrinter doesn't resolve decltypes, so resolve them here. // FIXME: This doesn't handle composite types that contain a decltype in them. @@ -133,6 +124,43 @@ std::string printType(QualType QT, const PrintingPolicy &Policy) { return QT.getAsString(Policy); } +std::string printType(const TemplateTypeParmDecl *TTP) { + std::string Res = TTP->wasDeclaredWithTypename() ? 
"typename" : "class"; + if (TTP->isParameterPack()) + Res += "..."; + return Res; +} + +std::string printType(const NonTypeTemplateParmDecl *NTTP, + const PrintingPolicy &PP) { + std::string Res = printType(NTTP->getType(), PP); + if (NTTP->isParameterPack()) + Res += "..."; + return Res; +} + +std::string printType(const TemplateTemplateParmDecl *TTP, + const PrintingPolicy &PP) { + std::string Res; + llvm::raw_string_ostream OS(Res); + OS << "template <"; + llvm::StringRef Sep = ""; + for (const Decl *Param : *TTP->getTemplateParameters()) { + OS << Sep; + Sep = ", "; + if (const auto *TTP = dyn_cast(Param)) + OS << printType(TTP); + else if (const auto *NTTP = dyn_cast(Param)) + OS << printType(NTTP, PP); + else if (const auto *TTPD = dyn_cast(Param)) + OS << printType(TTPD, PP); + } + // FIXME: TemplateTemplateParameter doesn't store the info on whether this + // param was a "typename" or "class". + OS << "> class"; + return OS.str(); +} + std::vector fetchTemplateParameters(const TemplateParameterList *Params, const PrintingPolicy &PP) { @@ -142,38 +170,30 @@ fetchTemplateParameters(const TemplateParameterList *Params, for (const Decl *Param : *Params) { HoverInfo::Param P; if (const auto *TTP = dyn_cast(Param)) { - P.Type = TTP->wasDeclaredWithTypename() ? 
"typename" : "class"; - if (TTP->isParameterPack()) - *P.Type += "..."; + P.Type = printType(TTP); if (!TTP->getName().empty()) P.Name = TTP->getNameAsString(); + if (TTP->hasDefaultArgument()) P.Default = TTP->getDefaultArgument().getAsString(PP); } else if (const auto *NTTP = dyn_cast(Param)) { + P.Type = printType(NTTP, PP); + if (IdentifierInfo *II = NTTP->getIdentifier()) P.Name = II->getName().str(); - P.Type = printType(NTTP->getType(), PP); - if (NTTP->isParameterPack()) - *P.Type += "..."; - if (NTTP->hasDefaultArgument()) { P.Default.emplace(); llvm::raw_string_ostream Out(*P.Default); NTTP->getDefaultArgument()->printPretty(Out, nullptr, PP); } } else if (const auto *TTPD = dyn_cast(Param)) { - P.Type.emplace(); - llvm::raw_string_ostream OS(*P.Type); - OS << "template <"; - printParams(OS, - fetchTemplateParameters(TTPD->getTemplateParameters(), PP)); - OS << "> class"; // FIXME: TemplateTemplateParameter doesn't store the - // info on whether this param was a "typename" or - // "class". + P.Type = printType(TTPD, PP); + if (!TTPD->getName().empty()) P.Name = TTPD->getNameAsString(); + if (TTPD->hasDefaultArgument()) { P.Default.emplace(); llvm::raw_string_ostream Out(*P.Default); @@ -385,6 +405,10 @@ HoverInfo getHoverContents(const NamedDecl *D, const SymbolIndex *Index) { fillFunctionTypeAndParams(HI, D, FD, Policy); else if (const auto *VD = dyn_cast(D)) HI.Type = printType(VD->getType(), Policy); + else if (const auto *TTP = dyn_cast(D)) + HI.Type = TTP->wasDeclaredWithTypename() ? "typename" : "class"; + else if (const auto *TTP = dyn_cast(D)) + HI.Type = printType(TTP, Policy); // Fill in value with evaluated initializer if possible. 
if (const auto *Var = dyn_cast(D)) { diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index 1e71c2ab37f5e..8e89c1f45f3a5 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -14,6 +14,7 @@ #include "Logger.h" #include "URI.h" #include "clang/Basic/LLVM.h" +#include "clang/Index/IndexSymbol.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" @@ -261,9 +262,13 @@ SymbolKind indexSymbolKindToSymbolKind(index::SymbolKind Kind) { case index::SymbolKind::ConversionFunction: return SymbolKind::Function; case index::SymbolKind::Parameter: + case index::SymbolKind::NonTypeTemplateParm: return SymbolKind::Variable; case index::SymbolKind::Using: return SymbolKind::Namespace; + case index::SymbolKind::TemplateTemplateParm: + case index::SymbolKind::TemplateTypeParm: + return SymbolKind::TypeParameter; } llvm_unreachable("invalid symbol kind"); } diff --git a/clang-tools-extra/clangd/Quality.cpp b/clang-tools-extra/clangd/Quality.cpp index d80790fc98083..2261ff61e4990 100644 --- a/clang-tools-extra/clangd/Quality.cpp +++ b/clang-tools-extra/clangd/Quality.cpp @@ -129,6 +129,8 @@ categorize(const index::SymbolInfo &D) { case index::SymbolKind::Extension: case index::SymbolKind::Union: case index::SymbolKind::TypeAlias: + case index::SymbolKind::TemplateTypeParm: + case index::SymbolKind::TemplateTemplateParm: return SymbolQualitySignals::Type; case index::SymbolKind::Function: case index::SymbolKind::ClassMethod: @@ -147,6 +149,7 @@ categorize(const index::SymbolInfo &D) { case index::SymbolKind::Field: case index::SymbolKind::EnumConstant: case index::SymbolKind::Parameter: + case index::SymbolKind::NonTypeTemplateParm: return SymbolQualitySignals::Variable; case index::SymbolKind::Using: case index::SymbolKind::Module: diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index 
5a1caa9645209..f59c19e8031ee 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -842,13 +842,7 @@ std::string renderTUAction(const TUAction &Action) { } // namespace unsigned getDefaultAsyncThreadsCount() { - unsigned HardwareConcurrency = llvm::heavyweight_hardware_concurrency(); - // heavyweight_hardware_concurrency may fall back to hardware_concurrency. - // C++ standard says that hardware_concurrency() may return 0; fallback to 1 - // worker thread in that case. - if (HardwareConcurrency == 0) - return 1; - return HardwareConcurrency; + return llvm::heavyweight_hardware_concurrency().compute_thread_count(); } FileStatus TUStatus::render(PathRef File) const { diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index ab80113a4a955..c2541237c3c93 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -148,9 +148,10 @@ BackgroundIndex::BackgroundIndex( CDB.watch([&](const std::vector &ChangedFiles) { enqueue(ChangedFiles); })) { - assert(ThreadPoolSize > 0 && "Thread pool size can't be zero."); + assert(Rebuilder.TUsBeforeFirstBuild > 0 && + "Thread pool size can't be zero."); assert(this->IndexStorageFactory && "Storage factory can not be null!"); - for (unsigned I = 0; I < ThreadPoolSize; ++I) { + for (unsigned I = 0; I < Rebuilder.TUsBeforeFirstBuild; ++I) { ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this] { WithContext Ctx(this->BackgroundContext.clone()); Queue.work([&] { Rebuilder.idle(); }); diff --git a/clang-tools-extra/clangd/index/Background.h b/clang-tools-extra/clangd/index/Background.h index b11008de15d02..2ae11c72d5d43 100644 --- a/clang-tools-extra/clangd/index/Background.h +++ b/clang-tools-extra/clangd/index/Background.h @@ -135,7 +135,7 @@ class BackgroundIndex : public SwapIndex { Context BackgroundContext, const FileSystemProvider &, const 
GlobalCompilationDatabase &CDB, BackgroundIndexStorage::Factory IndexStorageFactory, - size_t ThreadPoolSize = llvm::heavyweight_hardware_concurrency(), + size_t ThreadPoolSize = 0, // 0 = use all hardware threads std::function OnProgress = nullptr); ~BackgroundIndex(); // Blocks while the current task finishes. diff --git a/clang-tools-extra/clangd/index/BackgroundRebuild.h b/clang-tools-extra/clangd/index/BackgroundRebuild.h index d74c28be5cfb1..295f705c98e8f 100644 --- a/clang-tools-extra/clangd/index/BackgroundRebuild.h +++ b/clang-tools-extra/clangd/index/BackgroundRebuild.h @@ -49,7 +49,9 @@ class BackgroundIndexRebuilder { public: BackgroundIndexRebuilder(SwapIndex *Target, FileSymbols *Source, unsigned Threads) - : TUsBeforeFirstBuild(Threads), Target(Target), Source(Source) {} + : TUsBeforeFirstBuild(llvm::heavyweight_hardware_concurrency(Threads) + .compute_thread_count()), + Target(Target), Source(Source) {} // Called to indicate a TU has been indexed. // May rebuild, if enough TUs have been indexed. diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index 8b9b1d0033a5a..c5dd09b995087 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -13,6 +13,7 @@ #include "ParsedAST.h" #include "Selection.h" #include "SourceCode.h" +#include "Trace.h" #include "index/SymbolCollector.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclTemplate.h" @@ -124,6 +125,7 @@ llvm::Optional renameable(const NamedDecl &RenameDecl, StringRef MainFilePath, const SymbolIndex *Index, bool CrossFile) { + trace::Span Tracer("Renameable"); // Filter out symbols that are unsupported in both rename modes. if (llvm::isa(&RenameDecl)) return ReasonToReject::UnsupportedSymbol; @@ -225,6 +227,7 @@ llvm::Error makeError(ReasonToReject Reason) { // Return all rename occurrences in the main file. 
std::vector findOccurrencesWithinFile(ParsedAST &AST, const NamedDecl &ND) { + trace::Span Tracer("FindOccurrencesWithinFile"); // If the cursor is at the underlying CXXRecordDecl of the // ClassTemplateDecl, ND will be the CXXRecordDecl. In this case, we need to // get the primary template maunally. @@ -260,6 +263,7 @@ std::vector findOccurrencesWithinFile(ParsedAST &AST, llvm::Expected renameWithinFile(ParsedAST &AST, const NamedDecl &RenameDecl, llvm::StringRef NewName) { + trace::Span Tracer("RenameWithinFile"); const SourceManager &SM = AST.getSourceManager(); tooling::Replacements FilteredChanges; @@ -319,6 +323,7 @@ std::vector getConstructors(const NamedDecl *ND) { llvm::Expected>> findOccurrencesOutsideFile(const NamedDecl &RenameDecl, llvm::StringRef MainFile, const SymbolIndex &Index) { + trace::Span Tracer("FindOccurrencesOutsideFile"); RefsRequest RQuest; RQuest.IDs.insert(*getSymbolID(&RenameDecl)); // Classes and their constructors are different symbols, and have different @@ -361,6 +366,9 @@ findOccurrencesOutsideFile(const NamedDecl &RenameDecl, auto &Ranges = FileAndOccurrences.getValue(); llvm::sort(Ranges); Ranges.erase(std::unique(Ranges.begin(), Ranges.end()), Ranges.end()); + + SPAN_ATTACH(Tracer, FileAndOccurrences.first(), + static_cast(Ranges.size())); } return AffectedFiles; } @@ -381,6 +389,7 @@ llvm::Expected renameOutsideFile( const NamedDecl &RenameDecl, llvm::StringRef MainFilePath, llvm::StringRef NewName, const SymbolIndex &Index, llvm::function_ref(PathRef)> GetFileContent) { + trace::Span Tracer("RenameOutsideFile"); auto AffectedFiles = findOccurrencesOutsideFile(RenameDecl, MainFilePath, Index); if (!AffectedFiles) @@ -463,6 +472,7 @@ void findNearMiss( } // namespace llvm::Expected rename(const RenameInputs &RInputs) { + trace::Span Tracer("Rename flow"); ParsedAST &AST = RInputs.AST; const SourceManager &SM = AST.getSourceManager(); llvm::StringRef MainFileCode = SM.getBufferData(SM.getMainFileID()); @@ -555,6 +565,11 @@
llvm::Expected buildRenameEdit(llvm::StringRef AbsFilePath, llvm::StringRef InitialCode, std::vector Occurrences, llvm::StringRef NewName) { + trace::Span Tracer("BuildRenameEdit"); + SPAN_ATTACH(Tracer, "file_path", AbsFilePath); + SPAN_ATTACH(Tracer, "rename_occurrences", + static_cast(Occurrences.size())); + assert(std::is_sorted(Occurrences.begin(), Occurrences.end())); assert(std::unique(Occurrences.begin(), Occurrences.end()) == Occurrences.end() && @@ -618,6 +633,7 @@ llvm::Expected buildRenameEdit(llvm::StringRef AbsFilePath, llvm::Optional> adjustRenameRanges(llvm::StringRef DraftCode, llvm::StringRef Identifier, std::vector Indexed, const LangOptions &LangOpts) { + trace::Span Tracer("AdjustRenameRanges"); assert(!Indexed.empty()); assert(std::is_sorted(Indexed.begin(), Indexed.end())); std::vector Lexed = @@ -628,12 +644,16 @@ adjustRenameRanges(llvm::StringRef DraftCode, llvm::StringRef Identifier, llvm::Optional> getMappedRanges(ArrayRef Indexed, ArrayRef Lexed) { + trace::Span Tracer("GetMappedRanges"); assert(!Indexed.empty()); assert(std::is_sorted(Indexed.begin(), Indexed.end())); assert(std::is_sorted(Lexed.begin(), Lexed.end())); if (Indexed.size() > Lexed.size()) { vlog("The number of lexed occurrences is less than indexed occurrences"); + SPAN_ATTACH( + Tracer, "error", + "The number of lexed occurrences is less than indexed occurrences"); return llvm::None; } // Fast check for the special subset case. 
@@ -660,15 +680,18 @@ llvm::Optional> getMappedRanges(ArrayRef Indexed, }); if (HasMultiple) { vlog("The best near miss is not unique."); + SPAN_ATTACH(Tracer, "error", "The best near miss is not unique"); return llvm::None; } if (Best.empty()) { vlog("Didn't find a near miss."); + SPAN_ATTACH(Tracer, "error", "Didn't find a near miss"); return llvm::None; } std::vector Mapped; for (auto I : Best) Mapped.push_back(Lexed[I]); + SPAN_ATTACH(Tracer, "mapped_ranges", static_cast(Mapped.size())); return Mapped; } diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index a39c7431044f4..f9ffe11673380 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -481,7 +481,7 @@ TEST(CompletionTest, Kinds) { AllOf(Has("function", CompletionItemKind::Function), Has("variable", CompletionItemKind::Variable), Has("int", CompletionItemKind::Keyword), - Has("Struct", CompletionItemKind::Class), + Has("Struct", CompletionItemKind::Struct), Has("MACRO", CompletionItemKind::Text), Has("indexFunction", CompletionItemKind::Function), Has("indexVariable", CompletionItemKind::Variable), @@ -529,6 +529,17 @@ TEST(CompletionTest, Kinds) { AllOf(Named("complete_variable"), Kind(CompletionItemKind::Variable)), AllOf(Named("complete_static_member"), Kind(CompletionItemKind::Property)))); + + Results = completions( + R"cpp( + enum Color { + Red + }; + Color u = ^ + )cpp"); + EXPECT_THAT(Results.Completions, + Contains( + AllOf(Named("Red"), Kind(CompletionItemKind::EnumMember)))); } TEST(CompletionTest, NoDuplicates) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 2876e2f31c135..503b4d2afa42a 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -573,6 +573,42 @@ class Foo {})cpp"; // pattern. 
HI.Documentation = "comment from primary"; }}, + {// Template Type Parameter + R"cpp( + template void foo(); + )cpp", + [](HoverInfo &HI) { + HI.Name = "T"; + HI.Kind = index::SymbolKind::TemplateTypeParm; + HI.NamespaceScope = ""; + HI.Definition = "typename T = int"; + HI.LocalScope = "foo::"; + HI.Type = "typename"; + }}, + {// TemplateTemplate Type Parameter + R"cpp( + template class [[^T]]> void foo(); + )cpp", + [](HoverInfo &HI) { + HI.Name = "T"; + HI.Kind = index::SymbolKind::TemplateTemplateParm; + HI.NamespaceScope = ""; + HI.Definition = "template class T"; + HI.LocalScope = "foo::"; + HI.Type = "template class"; + }}, + {// NonType Template Parameter + R"cpp( + template void foo(); + )cpp", + [](HoverInfo &HI) { + HI.Name = "T"; + HI.Kind = index::SymbolKind::NonTypeTemplateParm; + HI.NamespaceScope = ""; + HI.Definition = "int T = 5"; + HI.LocalScope = "foo::"; + HI.Type = "int"; + }}, }; for (const auto &Case : Cases) { SCOPED_TRACE(Case.Code); diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 6c8c9f8020823..856d5e34bbcc2 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1190,50 +1190,8 @@ installed. Controlling Floating Point Behavior ----------------------------------- -Clang provides a number of ways to control floating point behavior, including -with command line options and source pragmas. This section -describes the various floating point semantic modes and the corresponding options. - -.. 
csv-table:: Floating Point Semantic Modes - :header: "Mode", "Values" - :widths: 15, 30, 30 - - "except_behavior", "{ignore, strict, may_trap}", "ffp-exception-behavior" - "fenv_access", "{off, on}", "(none)" - "rounding_mode", "{dynamic, tonearest, downward, upward, towardzero}", "frounding-math" - "contract", "{on, off, fast}", "ffp-contract" - "denormal_fp_math", "{IEEE, PreserveSign, PositiveZero}", "fdenormal-fp-math" - "denormal_fp32_math", "{IEEE, PreserveSign, PositiveZero}", "fdenormal-fp-math-fp32" - "support_math_errno", "{on, off}", "fmath-errno" - "no_honor_nans", "{on, off}", "fhonor-nans" - "no_honor_infinities", "{on, off}", "fhonor-infinities" - "no_signed_zeros", "{on, off}", "fsigned-zeros" - "allow_reciprocal", "{on, off}", "freciprocal-math" - "allow_approximate_fns", "{on, off}", "(none)" - "allow_reassociation", "{on, off}", "fassociative-math" - - -This table describes the option settings that correspond to the three -floating point semantic models: precise (the default), strict, and fast. - - -.. csv-table:: Floating Point Models - :header: "Mode", "Precise", "Strict", "Fast" - :widths: 25, 15, 15, 15 - - "except_behavior", "ignore", "strict", "ignore" - "fenv_access", "off", "on", "off" - "rounding_mode", "tonearest", "dynamic", "tonearest" - "contract", "on", "off", "fast" - "denormal_fp_math", "IEEE", "IEEE", "PreserveSign" - "denormal_fp32_math", "IEEE","IEEE", "PreserveSign" - "support_math_errno", "on", "on", "off" - "no_honor_nans", "off", "off", "on" - "no_honor_infinities", "off", "off", "on" - "no_signed_zeros", "off", "off", "on" - "allow_reciprocal", "off", "off", "on" - "allow_approximate_fns", "off", "off", "on" - "allow_reassociation", "off", "off", "on" +Clang provides a number of ways to control floating point behavior. The options +are listed below. .. option:: -ffast-math @@ -1427,7 +1385,7 @@ Note that floating-point operations performed as part of constant initialization and ``fast``. 
Details: - * ``precise`` Disables optimizations that are not value-safe on floating-point data, although FP contraction (FMA) is enabled (``-ffp-contract=on``). This is the default behavior. + * ``precise`` Disables optimizations that are not value-safe on floating-point data, although FP contraction (FMA) is enabled (``-ffp-contract=fast``). This is the default behavior. * ``strict`` Enables ``-frounding-math`` and ``-ffp-exception-behavior=strict``, and disables contractions (FMA). All of the ``-ffast-math`` enablements are disabled. * ``fast`` Behaves identically to specifying both ``-ffast-math`` and ``ffp-contract=fast`` diff --git a/clang/include/clang-c/BuildSystem.h b/clang/include/clang-c/BuildSystem.h index 4e9f6dee02795..296e61247cef5 100644 --- a/clang/include/clang-c/BuildSystem.h +++ b/clang/include/clang-c/BuildSystem.h @@ -117,7 +117,7 @@ clang_ModuleMapDescriptor_setFrameworkModuleName(CXModuleMapDescriptor, const char *name); /** - * Sets the umbrealla header name that the module.map describes. + * Sets the umbrella header name that the module.map describes. * \returns 0 for success, non-zero to indicate an error. */ CINDEX_LINKAGE enum CXErrorCode diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index b653995ebbd01..efb96f3cc5b6b 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -3745,7 +3745,7 @@ CINDEX_LINKAGE unsigned clang_Type_getNumObjCProtocolRefs(CXType T); CINDEX_LINKAGE CXCursor clang_Type_getObjCProtocolDecl(CXType T, unsigned i); /** - * Retreive the number of type arguments associated with an ObjC object. + * Retrieve the number of type arguments associated with an ObjC object. * * If the type is not an ObjC object, 0 is returned. 
*/ diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index bbaa46363d971..b2b53e80dc95f 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -17,6 +17,7 @@ #include "clang/AST/AttrIterator.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/OpenMPClause.h" #include "clang/AST/Type.h" #include "clang/Basic/AttrKinds.h" #include "clang/Basic/AttributeCommonInfo.h" diff --git a/clang/include/clang/AST/DeclObjC.h b/clang/include/clang/AST/DeclObjC.h index 73dc4ddab8983..954b9bc15789b 100644 --- a/clang/include/clang/AST/DeclObjC.h +++ b/clang/include/clang/AST/DeclObjC.h @@ -402,7 +402,7 @@ class ObjCMethodDecl : public NamedDecl, public DeclContext { } /// createImplicitParams - Used to lazily create the self and cmd - /// implict parameters. This must be called prior to using getSelfDecl() + /// implicit parameters. This must be called prior to using getSelfDecl() /// or getCmdDecl(). The call is ignored if the implicit parameters /// have already been created. void createImplicitParams(ASTContext &Context, const ObjCInterfaceDecl *ID); diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index f103530457ee3..ec470100f4ca2 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -31,6 +31,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPContext.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TrailingObjects.h" @@ -6658,6 +6659,53 @@ class OMPClausePrinter final : public OMPClauseVisitor { #include "clang/Basic/OpenMPKinds.def" }; +/// Helper data structure representing the traits in a match clause of an +/// `declare variant` or `metadirective`. 
The outer level is an ordered +/// collection of selector sets, each with an associated kind and an ordered +/// collection of selectors. A selector has a kind, an optional score/condition, +/// and an ordered collection of properties. +struct OMPTraitInfo { + struct OMPTraitProperty { + llvm::omp::TraitProperty Kind = llvm::omp::TraitProperty::invalid; + }; + struct OMPTraitSelector { + Expr *ScoreOrCondition = nullptr; + llvm::omp::TraitSelector Kind = llvm::omp::TraitSelector::invalid; + llvm::SmallVector Properties; + }; + struct OMPTraitSet { + llvm::omp::TraitSet Kind = llvm::omp::TraitSet::invalid; + llvm::SmallVector Selectors; + }; + + /// The outermost level of selector sets. + llvm::SmallVector Sets; + + bool anyScoreOrCondition( + llvm::function_ref Cond) { + return llvm::any_of(Sets, [Cond](OMPTraitInfo::OMPTraitSet &Set) { + return llvm::any_of( + Set.Selectors, [Cond](OMPTraitInfo::OMPTraitSelector &Selector) { + return Cond(Selector.ScoreOrCondition, + /* IsScore */ Selector.Kind != + llvm::omp::TraitSelector::user_condition); + }); + }); + } + + /// Create a variant match info object from this trait info object. While the + /// former is a flat representation the actual main difference is that the + /// latter uses clang::Expr to store the score/condition while the former is + /// independent of clang. Thus, expressions and conditions are evaluated in + /// this method. + void getAsVariantMatchInfo(ASTContext &ASTCtx, + llvm::omp::VariantMatchInfo &VMI) const; + + /// Print a human readable representation into \p OS. 
+ void print(llvm::raw_ostream &OS, const PrintingPolicy &Policy) const; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const OMPTraitInfo &TI); + } // namespace clang #endif // LLVM_CLANG_AST_OPENMPCLAUSE_H diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 6eeaf05ec71ba..c454c4a80500b 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -180,6 +180,27 @@ class FunctionArgument : Argument : Argument; + +// An argument of a OMPDeclareVariantAttribute that represents the `match` +// clause of the declare variant by keeping the information (incl. nesting) in +// an OMPTraitInfo object. +// +// With some exceptions, the `match()` clause looks roughly +// as follows: +// context-selector := list +// selector-set := ={list} +// selector := ([score():] list) +// trait := +// +// The structure of an OMPTraitInfo object is a tree as defined below: +// +// OMPTraitInfo := {list} +// OMPTraitSet := {Kind, list} +// OMPTraitSelector := {Kind, Expr, list} +// OMPTraitProperty := {Kind} +// +class OMPTraitInfoArgument : Argument; + class TypeArgument : Argument; class UnsignedArgument : Argument; class VariadicUnsignedArgument : Argument; @@ -3705,20 +3726,10 @@ def OMPDeclareVariant : InheritableAttr { let Documentation = [OMPDeclareVariantDocs]; let Args = [ ExprArgument<"VariantFuncRef">, - VariadicExprArgument<"Scores">, - VariadicUnsignedArgument<"CtxSelectorSets">, - VariadicUnsignedArgument<"CtxSelectors">, - VariadicStringArgument<"ImplVendors">, - VariadicStringArgument<"DeviceKinds"> + OMPTraitInfoArgument<"TraitInfos">, ]; let AdditionalMembers = [{ - void printScore(raw_ostream & OS, const PrintingPolicy &Policy, unsigned I) const { - if (const Expr *E = *std::next(scores_begin(), I)) { - OS << "score("; - E->printPretty(OS, nullptr, Policy); - OS << "):"; - } - } + ~OMPDeclareVariantAttr() { delete traitInfos; } void printPrettyPragma(raw_ostream & OS, const PrintingPolicy 
&Policy) const { if (const Expr *E = getVariantFuncRef()) { @@ -3726,66 +3737,8 @@ def OMPDeclareVariant : InheritableAttr { E->printPretty(OS, nullptr, Policy); OS << ")"; } - // TODO: add printing of real context selectors. OS << " match("; - int Used[OMP_CTX_SET_unknown] = {0}; - for (unsigned I = 0, E = ctxSelectorSets_size(); I < E; ++I) { - auto CtxSet = static_cast( - *std::next(ctxSelectorSets_begin(), I)); - if (Used[CtxSet]) - continue; - if (I > 0) - OS << ","; - switch (CtxSet) { - case OMP_CTX_SET_implementation: - OS << "implementation={"; - break; - case OMP_CTX_SET_device: - OS << "device={"; - break; - case OMP_CTX_SET_unknown: - llvm_unreachable("Unknown context selector set."); - } - Used[CtxSet] = 1; - for (unsigned K = I, EK = ctxSelectors_size(); K < EK; ++K) { - auto CtxSetK = static_cast( - *std::next(ctxSelectorSets_begin(), K)); - if (CtxSet != CtxSetK) - continue; - if (K != I) - OS << ","; - auto Ctx = static_cast( - *std::next(ctxSelectors_begin(), K)); - switch (Ctx) { - case OMP_CTX_vendor: - assert(CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - OS << "vendor("; - printScore(OS, Policy, K); - if (implVendors_size() > 0) { - OS << *implVendors(). 
begin(); - for (StringRef VendorName : llvm::drop_begin(implVendors(), 1)) - OS << ", " << VendorName; - } - OS << ")"; - break; - case OMP_CTX_kind: - assert(CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - OS << "kind("; - if (deviceKinds_size() > 0) { - OS << *deviceKinds().begin(); - for (StringRef KindName : llvm::drop_begin(deviceKinds(), 1)) - OS << ", " << KindName; - } - OS << ")"; - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector."); - } - } - OS << "}"; - } + traitInfos->print(OS, Policy); OS << ")"; } }]; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 8e62c0e8325d5..9410afb7aa027 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1258,30 +1258,68 @@ def err_omp_mapper_expected_declarator : Error< "expected declarator on 'omp declare mapper' directive">; def err_omp_declare_variant_wrong_clause : Error< "expected '%0' clause on 'omp declare variant' directive">; -def err_omp_declare_variant_no_ctx_selector : Error< - "expected context selector in '%0' clause on 'omp declare variant' directive">; -def err_omp_declare_variant_equal_expected : Error< - "expected '=' after '%0' context selector set name on 'omp declare variant' directive">; -def warn_omp_declare_variant_cs_name_expected : Warning< - "unknown context selector in '%0' context selector set of 'omp declare variant' directive, ignored">, - InGroup; -def err_omp_declare_variant_item_expected : Error< - "expected %0 in '%1' context selector of '%2' selector set of 'omp declare variant' directive">; -def err_omp_declare_variant_ctx_set_mutiple_use : Error< - "context selector set '%0' is used already in the same 'omp declare variant' directive">; -def note_omp_declare_variant_ctx_set_used_here : Note< - "previously context selector set '%0' used here">; -def err_omp_expected_comma_brace : 
Error<"expected '}' or ',' after '%0'">; -def err_omp_declare_variant_ctx_mutiple_use : Error< - "context trait selector '%0' is used already in the same '%1' context selector set of 'omp declare variant' directive">; -def note_omp_declare_variant_ctx_used_here : Note< - "previously context trait selector '%0' used here">; -def warn_omp_more_one_device_type_clause : Warning< - "more than one 'device_type' clause is specified">, - InGroup; -def err_omp_wrong_device_kind_trait : Error< - "unknown '%0' device kind trait in the 'device' context selector set, expected" - " one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'">; +def warn_omp_declare_variant_string_literal_or_identifier + : Warning<"expected identifier or string literal describing a context " + "%select{set|selector|property}0; " + "%select{set|selector|property}0 skipped">, + InGroup; +def note_omp_declare_variant_ctx_options + : Note<"context %select{set|selector|property}0 options are: %1">; +def warn_omp_declare_variant_expected + : Warning<"expected '%0' after the %1; '%0' assumed">, + InGroup; +def warn_omp_declare_variant_ctx_not_a_property + : Warning<"'%0' is not a valid context property for the context selector " + "'%1' and the context set '%2'; property ignored">, + InGroup; +def note_omp_declare_variant_ctx_is_a + : Note<"'%0' is a context %select{set|selector|property}1 not a context " + "%select{set|selector|property}2">; +def note_omp_declare_variant_ctx_try : Note<"try 'match(%0={%1%2})'">; +def warn_omp_declare_variant_ctx_not_a_selector + : Warning<"'%0' is not a valid context selector for the context set '%1'; " + "selector ignored">, + InGroup; +def warn_omp_declare_variant_ctx_not_a_set + : Warning<"'%0' is not a valid context set in a `declare variant`; set " + "ignored">, + InGroup; +def warn_omp_declare_variant_ctx_mutiple_use + : Warning<"the context %select{set|selector|property}0 '%1' was used " + "already in the same 'omp declare variant' directive; " + 
"%select{set|selector|property}0 ignored">, + InGroup; +def note_omp_declare_variant_ctx_used_here + : Note<"the previous context %select{set|selector|property}0 '%1' used " + "here">; +def note_omp_declare_variant_ctx_continue_here + : Note<"the ignored %select{set|selector|property}0 spans until here">; +def warn_omp_ctx_incompatible_selector_for_set + : Warning<"the context selector '%0' is not valid for the context set " + "'%1'; selector ignored">, + InGroup; +def note_omp_ctx_compatible_set_for_selector + : Note<"the context selector '%0' can be nested in the context set '%1'; " + "try 'match(%1={%0%select{|(property)}2})'">; +def warn_omp_ctx_selector_without_properties + : Warning<"the context selector '%0' in context set '%1' requires a " + "context property defined in parentheses; selector ignored">, + InGroup; +def warn_omp_ctx_incompatible_property_for_selector + : Warning<"the context property '%0' is not valid for the context selector " + "'%1' and the context set '%2'; property ignored">, + InGroup; +def note_omp_ctx_compatible_set_and_selector_for_property + : Note<"the context property '%0' can be nested in the context selector " + "'%1' which is nested in the context set '%2'; try " + "'match(%2={%1(%0)})'">; +def warn_omp_ctx_incompatible_score_for_property + : Warning<"the context selector '%0' in the context set '%1' cannot have a " + "score ('%2'); score ignored">, + InGroup; +def warn_omp_more_one_device_type_clause + : Warning<"more than one 'device_type' clause is specified">, + InGroup; // Pragma loop support. 
def err_pragma_loop_missing_argument : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 49c590633767a..3d446ec740fe8 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9951,6 +9951,12 @@ def warn_omp_declare_target_after_first_use : Warning< InGroup; def err_omp_declare_variant_incompat_attributes : Error< "'#pragma omp declare variant' is not compatible with any target-specific attributes">; +def warn_omp_declare_variant_score_not_constant + : Warning<"score expressions in the OpenMP context selector need to be " + "constant; %0 is not and will be ignored">; +def err_omp_declare_variant_user_condition_not_constant + : Error<"the user condition in the OpenMP context selector needs to be " + "constant; %0 is not">; def warn_omp_declare_variant_after_used : Warning< "'#pragma omp declare variant' cannot be applied for function after first " "usage; the original function might be used">, InGroup; diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index dd840b270e636..3ab69a1bb3f1f 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -203,12 +203,6 @@ #ifndef OPENMP_DECLARE_VARIANT_CLAUSE #define OPENMP_DECLARE_VARIANT_CLAUSE(Name) #endif -#ifndef OPENMP_CONTEXT_SELECTOR_SET -#define OPENMP_CONTEXT_SELECTOR_SET(Name) -#endif -#ifndef OPENMP_CONTEXT_SELECTOR -#define OPENMP_CONTEXT_SELECTOR(Name) -#endif #ifndef OPENMP_LASTPRIVATE_KIND #define OPENMP_LASTPRIVATE_KIND(Name) #endif @@ -219,14 +213,6 @@ #define OPENMP_FLUSH_CLAUSE(Name) #endif -// OpenMP context selector sets. -OPENMP_CONTEXT_SELECTOR_SET(implementation) -OPENMP_CONTEXT_SELECTOR_SET(device) - -// OpenMP context selectors. -OPENMP_CONTEXT_SELECTOR(vendor) -OPENMP_CONTEXT_SELECTOR(kind) - // OpenMP clauses. 
OPENMP_CLAUSE(allocator, OMPAllocatorClause) OPENMP_CLAUSE(if, OMPIfClause) @@ -1102,8 +1088,6 @@ OPENMP_FLUSH_CLAUSE(release) #undef OPENMP_FLUSH_CLAUSE #undef OPENMP_ORDER_KIND #undef OPENMP_LASTPRIVATE_KIND -#undef OPENMP_CONTEXT_SELECTOR -#undef OPENMP_CONTEXT_SELECTOR_SET #undef OPENMP_DECLARE_VARIANT_CLAUSE #undef OPENMP_DEVICE_TYPE_KIND #undef OPENMP_ALLOCATE_CLAUSE diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 86c4ad1f754d6..2a08ef6d372aa 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -19,45 +19,6 @@ namespace clang { -/// OpenMP context selector sets. -enum OpenMPContextSelectorSetKind { -#define OPENMP_CONTEXT_SELECTOR_SET(Name) OMP_CTX_SET_##Name, -#include "clang/Basic/OpenMPKinds.def" - OMP_CTX_SET_unknown, -}; - -/// OpenMP context selectors. -enum OpenMPContextSelectorKind { -#define OPENMP_CONTEXT_SELECTOR(Name) OMP_CTX_##Name, -#include "clang/Basic/OpenMPKinds.def" - OMP_CTX_unknown, -}; - -OpenMPContextSelectorSetKind getOpenMPContextSelectorSet(llvm::StringRef Str); -llvm::StringRef -getOpenMPContextSelectorSetName(OpenMPContextSelectorSetKind Kind); -OpenMPContextSelectorKind getOpenMPContextSelector(llvm::StringRef Str); -llvm::StringRef getOpenMPContextSelectorName(OpenMPContextSelectorKind Kind); - -/// Struct to store the context selectors info. 
-template struct OpenMPCtxSelectorData { - OpenMPContextSelectorSetKind CtxSet = OMP_CTX_SET_unknown; - OpenMPContextSelectorKind Ctx = OMP_CTX_unknown; - ScoreT Score; - VectorType Names; - explicit OpenMPCtxSelectorData() = default; - explicit OpenMPCtxSelectorData(OpenMPContextSelectorSetKind CtxSet, - OpenMPContextSelectorKind Ctx, - const ScoreT &Score, VectorType &&Names) - : CtxSet(CtxSet), Ctx(Ctx), Score(Score), Names(Names) {} - template - explicit OpenMPCtxSelectorData(OpenMPContextSelectorSetKind CtxSet, - OpenMPContextSelectorKind Ctx, - const ScoreT &Score, const U &Names) - : CtxSet(CtxSet), Ctx(Ctx), Score(Score), - Names(Names.begin(), Names.end()) {} -}; - /// OpenMP directives. using OpenMPDirectiveKind = llvm::omp::Directive; diff --git a/clang/include/clang/Index/IndexSymbol.h b/clang/include/clang/Index/IndexSymbol.h index 2e1e6005d68a6..de98b8147e8ad 100644 --- a/clang/include/clang/Index/IndexSymbol.h +++ b/clang/include/clang/Index/IndexSymbol.h @@ -54,6 +54,9 @@ enum class SymbolKind : uint8_t { Parameter, Using, + TemplateTypeParm, + TemplateTemplateParm, + NonTypeTemplateParm, }; enum class SymbolLanguage : uint8_t { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 5cf1bd457eb07..c8d112054b478 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1701,6 +1701,8 @@ class Parser : public CodeCompletionHandler { unsigned &NumLineToksConsumed, bool IsUnevaluated); + ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral = false); + private: ExprResult ParseExpressionWithLeadingAt(SourceLocation AtLoc); @@ -1794,8 +1796,6 @@ class Parser : public CodeCompletionHandler { SourceLocation LParenLoc, SourceLocation RParenLoc); - ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral = false); - ExprResult ParseGenericSelectionExpression(); ExprResult ParseObjCBoolLiteral(); @@ -2929,11 +2929,39 @@ class Parser : public CodeCompletionHandler 
{ DeclGroupPtrTy ParseOMPDeclareSimdClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks, SourceLocation Loc); - /// Parses OpenMP context selectors and calls \p Callback for each - /// successfully parsed context selector. - bool - parseOpenMPContextSelectors(SourceLocation Loc, - SmallVectorImpl &Data); + + /// Parse a property kind into \p TIProperty for the selector set \p Set and + /// selector \p Selector. + void parseOMPTraitPropertyKind(OMPTraitInfo::OMPTraitProperty &TIProperty, + llvm::omp::TraitSet Set, + llvm::omp::TraitSelector Selector, + llvm::StringMap &Seen); + + /// Parse a selector kind into \p TISelector for the selector set \p Set. + void parseOMPTraitSelectorKind(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen); + + /// Parse a selector set kind into \p TISet. + void parseOMPTraitSetKind(OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &Seen); + + /// Parses an OpenMP context property. + void parseOMPContextProperty(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen); + + /// Parses an OpenMP context selector. + void parseOMPContextSelector(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &SeenSelectors); + + /// Parses an OpenMP context selector set. + void parseOMPContextSelectorSet(OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &SeenSets); + + /// Parses OpenMP context selectors. + bool parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI); /// Parse clauses for '#pragma omp declare variant'. void ParseOMPDeclareVariantClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0bc80daf06993..4ccb92aa22d87 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -9869,9 +9869,6 @@ class Sema final { public: /// Struct to store the context selectors info for declare variant directive. 
- using OMPCtxStringType = SmallString<8>; - using OMPCtxSelectorData = - OpenMPCtxSelectorData, ExprResult>; /// Checks if the variant/multiversion functions are compatible. bool areMultiversionVariantFunctionsCompatible( @@ -10343,10 +10340,12 @@ class Sema final { /// applied to. /// \param VariantRef Expression that references the variant function, which /// must be used instead of the original one, specified in \p DG. + /// \param TI The trait info object representing the match clause. /// \returns None, if the function/variant function are not compatible with /// the pragma, pair of original function/variant ref expression otherwise. - Optional> checkOpenMPDeclareVariantFunction( - DeclGroupPtrTy DG, Expr *VariantRef, SourceRange SR); + Optional> + checkOpenMPDeclareVariantFunction(DeclGroupPtrTy DG, Expr *VariantRef, + OMPTraitInfo &TI, SourceRange SR); /// Called on well-formed '\#pragma omp declare variant' after parsing of /// the associated method/function. @@ -10354,11 +10353,9 @@ class Sema final { /// applied to. /// \param VariantRef Expression that references the variant function, which /// must be used instead of the original one, specified in \p DG. - /// \param Data Set of context-specific data for the specified context - /// selector. + /// \param TI The context traits associated with the function variant. void ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD, Expr *VariantRef, - SourceRange SR, - ArrayRef Data); + OMPTraitInfo *TI, SourceRange SR); OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index f6dc8b2b7ae2d..362296024a970 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -22,6 +22,7 @@ #include "llvm/ADT/APSInt.h" namespace clang { +struct OMPTraitInfo; /// An object for streaming information from a record. 
class ASTRecordReader @@ -258,6 +259,9 @@ class ASTRecordReader return Reader->ReadCXXTemporary(*F, Record, Idx); } + /// Read an OMPTraitInfo object, advancing Idx. + OMPTraitInfo *readOMPTraitInfo(); + /// Read an OpenMP clause, advancing Idx. OMPClause *readOMPClause(); diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h index 43af68628ecc7..2a35c694ccf8d 100644 --- a/clang/include/clang/Serialization/ASTRecordWriter.h +++ b/clang/include/clang/Serialization/ASTRecordWriter.h @@ -266,6 +266,9 @@ class ASTRecordWriter void AddCXXDefinitionData(const CXXRecordDecl *D); + /// Write an OMPTraitInfo object. + void writeOMPTraitInfo(OMPTraitInfo *TI); + void writeOMPClause(OMPClause *C); /// Emit a string. diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 6eac98250c8ff..1cd1c82c8f9d4 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1722,3 +1722,107 @@ void OMPClausePrinter::VisitOMPOrderClause(OMPOrderClause *Node) { OS << "order(" << getOpenMPSimpleClauseTypeName(OMPC_order, Node->getKind()) << ")"; } + +void OMPTraitInfo::getAsVariantMatchInfo( + ASTContext &ASTCtx, llvm::omp::VariantMatchInfo &VMI) const { + for (const OMPTraitSet &Set : Sets) { + for (const OMPTraitSelector &Selector : Set.Selectors) { + + // User conditions are special as we evaluate the condition here. + if (Selector.Kind == llvm::omp::TraitSelector::user_condition) { + assert(Selector.ScoreOrCondition && + "Ill-formed user condition, expected condition expression!"); + assert(Selector.Properties.size() == 1 && + Selector.Properties.front().Kind == + llvm::omp::TraitProperty::user_condition_unknown && + "Ill-formed user condition, expected unknown trait property!"); + + llvm::APInt CondVal = + Selector.ScoreOrCondition->EvaluateKnownConstInt(ASTCtx); + VMI.addTrait(CondVal.isNullValue() + ? 
llvm::omp::TraitProperty::user_condition_false + : llvm::omp::TraitProperty::user_condition_true); + continue; + } + + llvm::APInt Score; + llvm::APInt *ScorePtr = nullptr; + if (Selector.ScoreOrCondition) { + Score = Selector.ScoreOrCondition->EvaluateKnownConstInt(ASTCtx); + ScorePtr = &Score; + } + for (const OMPTraitProperty &Property : Selector.Properties) + VMI.addTrait(Set.Kind, Property.Kind, ScorePtr); + + if (Set.Kind != llvm::omp::TraitSet::construct) + continue; + + // TODO: This might not hold once we implement SIMD properly. + assert(Selector.Properties.size() == 1 && + Selector.Properties.front().Kind == + llvm::omp::getOpenMPContextTraitPropertyForSelector( + Selector.Kind) && + "Ill-formed construct selector!"); + + VMI.ConstructTraits.push_back(Selector.Properties.front().Kind); + } + } +} + +void OMPTraitInfo::print(llvm::raw_ostream &OS, + const PrintingPolicy &Policy) const { + bool FirstSet = true; + for (const OMPTraitInfo::OMPTraitSet &Set : Sets) { + if (!FirstSet) + OS << ", "; + FirstSet = false; + OS << llvm::omp::getOpenMPContextTraitSetName(Set.Kind) << "={"; + + bool FirstSelector = true; + for (const OMPTraitInfo::OMPTraitSelector &Selector : Set.Selectors) { + if (!FirstSelector) + OS << ", "; + FirstSelector = false; + OS << llvm::omp::getOpenMPContextTraitSelectorName(Selector.Kind); + + bool AllowsTraitScore = false; + bool RequiresProperty = false; + llvm::omp::isValidTraitSelectorForTraitSet( + Selector.Kind, Set.Kind, AllowsTraitScore, RequiresProperty); + + if (!RequiresProperty) + continue; + + OS << "("; + if (Selector.Kind == llvm::omp::TraitSelector::user_condition) { + Selector.ScoreOrCondition->printPretty(OS, nullptr, Policy); + } else { + + if (Selector.ScoreOrCondition) { + OS << "score("; + Selector.ScoreOrCondition->printPretty(OS, nullptr, Policy); + OS << "): "; + } + + bool FirstProperty = true; + for (const OMPTraitInfo::OMPTraitProperty &Property : + Selector.Properties) { + if (!FirstProperty) + OS << ", "; + 
FirstProperty = false; + OS << llvm::omp::getOpenMPContextTraitPropertyName(Property.Kind); + } + } + OS << ")"; + } + OS << "}"; + } +} + +llvm::raw_ostream &clang::operator<<(llvm::raw_ostream &OS, + const OMPTraitInfo &TI) { + LangOptions LO; + PrintingPolicy Policy(LO); + TI.print(OS, Policy); + return OS; +} diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 70817f8e464ae..ff0f287003bfc 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -20,49 +20,6 @@ using namespace clang; using namespace llvm::omp; -OpenMPContextSelectorSetKind -clang::getOpenMPContextSelectorSet(llvm::StringRef Str) { - return llvm::StringSwitch(Str) -#define OPENMP_CONTEXT_SELECTOR_SET(Name) .Case(#Name, OMP_CTX_SET_##Name) -#include "clang/Basic/OpenMPKinds.def" - .Default(OMP_CTX_SET_unknown); -} - -llvm::StringRef -clang::getOpenMPContextSelectorSetName(OpenMPContextSelectorSetKind Kind) { - switch (Kind) { - case OMP_CTX_SET_unknown: - return "unknown"; -#define OPENMP_CONTEXT_SELECTOR_SET(Name) \ - case OMP_CTX_SET_##Name: \ - return #Name; -#include "clang/Basic/OpenMPKinds.def" - break; - } - llvm_unreachable("Invalid OpenMP context selector set kind"); -} - -OpenMPContextSelectorKind clang::getOpenMPContextSelector(llvm::StringRef Str) { - return llvm::StringSwitch(Str) -#define OPENMP_CONTEXT_SELECTOR(Name) .Case(#Name, OMP_CTX_##Name) -#include "clang/Basic/OpenMPKinds.def" - .Default(OMP_CTX_unknown); -} - -llvm::StringRef -clang::getOpenMPContextSelectorName(OpenMPContextSelectorKind Kind) { - switch (Kind) { - case OMP_CTX_unknown: - return "unknown"; -#define OPENMP_CONTEXT_SELECTOR(Name) \ - case OMP_CTX_##Name: \ - return #Name; -#include "clang/Basic/OpenMPKinds.def" - break; - } - llvm_unreachable("Invalid OpenMP context selector kind"); -} - OpenMPClauseKind clang::getOpenMPClauseKind(StringRef Str) { // 'flush' clause cannot be specified explicitly, because this is an implicit // clause for 'flush' 
directive. If the 'flush' clause is explicitly specified diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index e41d4962f03a7..60b81492f78ea 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11065,260 +11065,34 @@ Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF, return Address(Addr, Align); } -namespace { -using OMPContextSelectorData = - OpenMPCtxSelectorData, llvm::APSInt>; -using CompleteOMPContextSelectorData = SmallVector; -} // anonymous namespace - -/// Checks current context and returns true if it matches the context selector. -template -static bool checkContext(const OMPContextSelectorData &Data, - Arguments... Params) { - assert(Data.CtxSet != OMP_CTX_SET_unknown && Data.Ctx != OMP_CTX_unknown && - "Unknown context selector or context selector set."); - return false; -} - -/// Checks for implementation={vendor()} context selector. -/// \returns true iff ="llvm", false otherwise. -template <> -bool checkContext( - const OMPContextSelectorData &Data) { - return llvm::all_of(Data.Names, - [](StringRef S) { return !S.compare_lower("llvm"); }); -} - -/// Checks for device={kind()} context selector. -/// \returns true if ="host" and compilation is for host. -/// true if ="nohost" and compilation is for device. -/// true if ="cpu" and compilation is for Arm, X86 or PPC CPU. -/// true if ="gpu" and compilation is for NVPTX or AMDGCN. -/// false otherwise. 
-template <> -bool checkContext( - const OMPContextSelectorData &Data, CodeGenModule &CGM) { - for (StringRef Name : Data.Names) { - if (!Name.compare_lower("host")) { - if (CGM.getLangOpts().OpenMPIsDevice) - return false; - continue; - } - if (!Name.compare_lower("nohost")) { - if (!CGM.getLangOpts().OpenMPIsDevice) - return false; - continue; - } - switch (CGM.getTriple().getArch()) { - case llvm::Triple::arm: - case llvm::Triple::armeb: - case llvm::Triple::aarch64: - case llvm::Triple::aarch64_be: - case llvm::Triple::aarch64_32: - case llvm::Triple::ppc: - case llvm::Triple::ppc64: - case llvm::Triple::ppc64le: - case llvm::Triple::x86: - case llvm::Triple::x86_64: - if (Name.compare_lower("cpu")) - return false; - break; - case llvm::Triple::amdgcn: - case llvm::Triple::nvptx: - case llvm::Triple::nvptx64: - if (Name.compare_lower("gpu")) - return false; - break; - case llvm::Triple::UnknownArch: - case llvm::Triple::arc: - case llvm::Triple::avr: - case llvm::Triple::bpfel: - case llvm::Triple::bpfeb: - case llvm::Triple::hexagon: - case llvm::Triple::fpga_aoco: - case llvm::Triple::fpga_aocr: - case llvm::Triple::fpga_aocx: - case llvm::Triple::mips: - case llvm::Triple::mipsel: - case llvm::Triple::mips64: - case llvm::Triple::mips64el: - case llvm::Triple::msp430: - case llvm::Triple::r600: - case llvm::Triple::riscv32: - case llvm::Triple::riscv64: - case llvm::Triple::sparc: - case llvm::Triple::sparcv9: - case llvm::Triple::sparcel: - case llvm::Triple::systemz: - case llvm::Triple::tce: - case llvm::Triple::tcele: - case llvm::Triple::thumb: - case llvm::Triple::thumbeb: - case llvm::Triple::xcore: - case llvm::Triple::le32: - case llvm::Triple::le64: - case llvm::Triple::amdil: - case llvm::Triple::amdil64: - case llvm::Triple::hsail: - case llvm::Triple::hsail64: - case llvm::Triple::spir: - case llvm::Triple::spir64: - case llvm::Triple::kalimba: - case llvm::Triple::shave: - case llvm::Triple::lanai: - case llvm::Triple::wasm32: - case 
llvm::Triple::wasm64: - case llvm::Triple::renderscript32: - case llvm::Triple::renderscript64: - case llvm::Triple::ve: - return false; - } - } - return true; -} - -static bool matchesContext(CodeGenModule &CGM, - const CompleteOMPContextSelectorData &ContextData) { - for (const OMPContextSelectorData &Data : ContextData) { - switch (Data.Ctx) { - case OMP_CTX_vendor: - assert(Data.CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - if (!checkContext(Data)) - return false; - break; - case OMP_CTX_kind: - assert(Data.CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - if (!checkContext(Data, - CGM)) - return false; - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector kind."); - } - } - return true; -} - -static CompleteOMPContextSelectorData -translateAttrToContextSelectorData(ASTContext &C, - const OMPDeclareVariantAttr *A) { - CompleteOMPContextSelectorData Data; - for (unsigned I = 0, E = A->scores_size(); I < E; ++I) { - Data.emplace_back(); - auto CtxSet = static_cast( - *std::next(A->ctxSelectorSets_begin(), I)); - auto Ctx = static_cast( - *std::next(A->ctxSelectors_begin(), I)); - Data.back().CtxSet = CtxSet; - Data.back().Ctx = Ctx; - const Expr *Score = *std::next(A->scores_begin(), I); - Data.back().Score = Score->EvaluateKnownConstInt(C); - switch (Ctx) { - case OMP_CTX_vendor: - assert(CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - Data.back().Names = - llvm::makeArrayRef(A->implVendors_begin(), A->implVendors_end()); - break; - case OMP_CTX_kind: - assert(CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - Data.back().Names = - llvm::makeArrayRef(A->deviceKinds_begin(), A->deviceKinds_end()); - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector kind."); - } - } - return Data; -} - -static bool isStrictSubset(const CompleteOMPContextSelectorData &LHS, - const 
CompleteOMPContextSelectorData &RHS) { - llvm::SmallDenseMap, llvm::StringSet<>, 4> RHSData; - for (const OMPContextSelectorData &D : RHS) { - auto &Pair = RHSData.FindAndConstruct(std::make_pair(D.CtxSet, D.Ctx)); - Pair.getSecond().insert(D.Names.begin(), D.Names.end()); - } - bool AllSetsAreEqual = true; - for (const OMPContextSelectorData &D : LHS) { - auto It = RHSData.find(std::make_pair(D.CtxSet, D.Ctx)); - if (It == RHSData.end()) - return false; - if (D.Names.size() > It->getSecond().size()) - return false; - if (llvm::set_union(It->getSecond(), D.Names)) - return false; - AllSetsAreEqual = - AllSetsAreEqual && (D.Names.size() == It->getSecond().size()); - } - - return LHS.size() != RHS.size() || !AllSetsAreEqual; -} - -static bool greaterCtxScore(const CompleteOMPContextSelectorData &LHS, - const CompleteOMPContextSelectorData &RHS) { - // Score is calculated as sum of all scores + 1. - llvm::APSInt LHSScore(llvm::APInt(64, 1), /*isUnsigned=*/false); - bool RHSIsSubsetOfLHS = isStrictSubset(RHS, LHS); - if (RHSIsSubsetOfLHS) { - LHSScore = llvm::APSInt::get(0); - } else { - for (const OMPContextSelectorData &Data : LHS) { - if (Data.Score.getBitWidth() > LHSScore.getBitWidth()) { - LHSScore = LHSScore.extend(Data.Score.getBitWidth()) + Data.Score; - } else if (Data.Score.getBitWidth() < LHSScore.getBitWidth()) { - LHSScore += Data.Score.extend(LHSScore.getBitWidth()); - } else { - LHSScore += Data.Score; - } - } - } - llvm::APSInt RHSScore(llvm::APInt(64, 1), /*isUnsigned=*/false); - if (!RHSIsSubsetOfLHS && isStrictSubset(LHS, RHS)) { - RHSScore = llvm::APSInt::get(0); - } else { - for (const OMPContextSelectorData &Data : RHS) { - if (Data.Score.getBitWidth() > RHSScore.getBitWidth()) { - RHSScore = RHSScore.extend(Data.Score.getBitWidth()) + Data.Score; - } else if (Data.Score.getBitWidth() < RHSScore.getBitWidth()) { - RHSScore += Data.Score.extend(RHSScore.getBitWidth()); - } else { - RHSScore += Data.Score; - } - } - } - return 
llvm::APSInt::compareValues(LHSScore, RHSScore) >= 0; -} - /// Finds the variant function that matches current context with its context /// selector. static const FunctionDecl *getDeclareVariantFunction(CodeGenModule &CGM, const FunctionDecl *FD) { if (!FD->hasAttrs() || !FD->hasAttr()) return FD; - // Iterate through all DeclareVariant attributes and check context selectors. - const OMPDeclareVariantAttr *TopMostAttr = nullptr; - CompleteOMPContextSelectorData TopMostData; + + SmallVector VariantExprs; + SmallVector VMIs; for (const auto *A : FD->specific_attrs()) { - CompleteOMPContextSelectorData Data = - translateAttrToContextSelectorData(CGM.getContext(), A); - if (!matchesContext(CGM, Data)) + const OMPTraitInfo *TI = A->getTraitInfos(); + if (!TI) continue; - // If the attribute matches the context, find the attribute with the highest - // score. - if (!TopMostAttr || !greaterCtxScore(TopMostData, Data)) { - TopMostAttr = A; - TopMostData.swap(Data); - } + VMIs.push_back(VariantMatchInfo()); + TI->getAsVariantMatchInfo(CGM.getContext(), VMIs.back()); + VariantExprs.push_back(A->getVariantFuncRef()); } - if (!TopMostAttr) + + OMPContext Ctx(CGM.getLangOpts().OpenMPIsDevice, CGM.getTriple()); + // FIXME: Keep the context in the OMPIRBuilder so we can add constructs as we + // build them. 
+ + int BestMatchIdx = getBestVariantMatchForContext(VMIs, Ctx); + if (BestMatchIdx < 0) return FD; + return cast( - cast(TopMostAttr->getVariantFuncRef()->IgnoreParenImpCasts()) + cast(VariantExprs[BestMatchIdx]->IgnoreParenImpCasts()) ->getDecl()); } diff --git a/clang/lib/Driver/ToolChains/Ananas.cpp b/clang/lib/Driver/ToolChains/Ananas.cpp index 2f11c9739a0eb..10e4ea70db41d 100644 --- a/clang/lib/Driver/ToolChains/Ananas.cpp +++ b/clang/lib/Driver/ToolChains/Ananas.cpp @@ -103,7 +103,7 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5208a1953d4c3..613d47fb3ad02 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2538,9 +2538,10 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath; llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math; - StringRef FPContract = "on"; + StringRef FPContract = ""; bool StrictFPModel = false; + if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); CmdArgs.push_back(A->getValue()); @@ -2563,6 +2564,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, SignedZeros = true; // -fno_fast_math restores default denormal and fpcontract handling DenormalFPMath = DefaultDenormalFPMath; + FPContract = ""; StringRef Val = A->getValue(); if (OFastEnabled && !Val.equals("fast")) { // Only -ffp-model=fast is compatible with OFast, ignore. 
@@ -2576,10 +2578,12 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, // ffp-model= is a Driver option, it is entirely rewritten into more // granular options before being passed into cc1. // Use the gcc option in the switch below. - if (!FPModel.empty() && !FPModel.equals(Val)) + if (!FPModel.empty() && !FPModel.equals(Val)) { D.Diag(clang::diag::warn_drv_overriding_flag_option) << Args.MakeArgString("-ffp-model=" + FPModel) << Args.MakeArgString("-ffp-model=" + Val); + FPContract = ""; + } if (Val.equals("fast")) { optID = options::OPT_ffast_math; FPModel = Val; @@ -2587,7 +2591,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, } else if (Val.equals("precise")) { optID = options::OPT_ffp_contract; FPModel = Val; - FPContract = "on"; + FPContract = "fast"; PreciseFPModel = true; } else if (Val.equals("strict")) { StrictFPModel = true; @@ -2673,11 +2677,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, case options::OPT_ffp_contract: { StringRef Val = A->getValue(); if (PreciseFPModel) { - // When -ffp-model=precise is seen on the command line, - // the boolean PreciseFPModel is set to true which indicates - // "the current option is actually PreciseFPModel". The optID - // is changed to OPT_ffp_contract and FPContract is set to "on". - // the argument Val string is "precise": it shouldn't be checked. + // -ffp-model=precise enables ffp-contract=fast as a side effect + // the FPContract value has already been set to a string literal + // and the Val string isn't a pertinent value. 
; } else if (Val.equals("fast") || Val.equals("on") || Val.equals("off")) FPContract = Val; @@ -2774,7 +2776,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, // -fno_fast_math restores default denormal and fpcontract handling DenormalFPMath = DefaultDenormalFPMath; DenormalFP32Math = DefaultDenormalFP32Math; - FPContract = "on"; + FPContract = ""; break; } if (StrictFPModel) { @@ -2784,7 +2786,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, if (HonorINFs && HonorNaNs && !AssociativeMath && !ReciprocalMath && SignedZeros && TrappingMath && RoundingFPMath && - FPContract.equals("off")) + (FPContract.equals("off") || FPContract.empty())) // OK: Current Arg doesn't conflict with -ffp-model=strict ; else { diff --git a/clang/lib/Driver/ToolChains/CloudABI.cpp b/clang/lib/Driver/ToolChains/CloudABI.cpp index 77672a99d989c..0602e4f6d0b3d 100644 --- a/clang/lib/Driver/ToolChains/CloudABI.cpp +++ b/clang/lib/Driver/ToolChains/CloudABI.cpp @@ -75,7 +75,7 @@ void cloudabi::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index f082bf8ce98b5..ebdb22fae3963 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -361,7 +361,7 @@ bool tools::isUseSeparateSections(const llvm::Triple &Triple) { return Triple.getOS() == llvm::Triple::CloudABI; } -void tools::AddGoldPlugin(const ToolChain &ToolChain, const ArgList &Args, +void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { const char *Linker = 
Args.MakeArgString(ToolChain.GetLinkerPath()); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index bf1ab8153de78..984f3ee98af1e 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -58,7 +58,7 @@ void SplitDebugInfo(const ToolChain &TC, Compilation &C, const Tool &T, const JobAction &JA, const llvm::opt::ArgList &Args, const InputInfo &Output, const char *OutFile); -void AddGoldPlugin(const ToolChain &ToolChain, const llvm::opt::ArgList &Args, +void addLTOOptions(const ToolChain &ToolChain, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO); diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index c5c6f530f48c0..3f3d6e7c72eb2 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -275,7 +275,7 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 1e1f003daf831..6114829ac8e18 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -111,7 +111,7 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index bffaa12cbd8cb..f5ff6795c1f6b 100644 --- 
a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -544,7 +544,7 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } @@ -2151,6 +2151,7 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu", "riscv64-linux-gnu", "riscv64-unknown-elf", + "riscv64-redhat-linux", "riscv64-suse-linux"}; static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"}; diff --git a/clang/lib/Index/IndexSymbol.cpp b/clang/lib/Index/IndexSymbol.cpp index ae9134bf11826..0d2e557cdd367 100644 --- a/clang/lib/Index/IndexSymbol.cpp +++ b/clang/lib/Index/IndexSymbol.cpp @@ -357,6 +357,15 @@ SymbolInfo index::getSymbolInfo(const Decl *D) { case Decl::VarTemplate: llvm_unreachable("variables handled before"); break; + case Decl::TemplateTypeParm: + Info.Kind = SymbolKind::TemplateTypeParm; + break; + case Decl::TemplateTemplateParm: + Info.Kind = SymbolKind::TemplateTemplateParm; + break; + case Decl::NonTypeTemplateParm: + Info.Kind = SymbolKind::NonTypeTemplateParm; + break; // Other decls get the 'unknown' kind. 
default: break; @@ -517,6 +526,9 @@ StringRef index::getSymbolKindString(SymbolKind K) { case SymbolKind::ConversionFunction: return "conversion-func"; case SymbolKind::Parameter: return "param"; case SymbolKind::Using: return "using"; + case SymbolKind::TemplateTypeParm: return "template-type-param"; + case SymbolKind::TemplateTemplateParm: return "template-template-param"; + case SymbolKind::NonTypeTemplateParm: return "non-type-template-param"; } llvm_unreachable("invalid symbol kind"); } diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index fbabe92977c9d..e1bcbdb05499b 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -19,6 +19,7 @@ #include "clang/Sema/Scope.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/Frontend/OpenMP/OMPContext.h" using namespace clang; using namespace llvm::omp; @@ -810,10 +811,225 @@ Parser::ParseOMPDeclareSimdClauses(Parser::DeclGroupPtrTy Ptr, LinModifiers, Steps, SourceRange(Loc, EndLoc)); } +namespace { +/// Constant used in the diagnostics to distinguish the levels in an OpenMP +/// contexts: selector-set={selector(trait, ...), ...}, .... +enum OMPContextLvl { + CONTEXT_SELECTOR_SET_LVL = 0, + CONTEXT_SELECTOR_LVL = 1, + CONTEXT_TRAIT_LVL = 2, +}; + +static StringRef stringLiteralParser(Parser &P) { + ExprResult Res = P.ParseStringLiteralExpression(true); + return Res.isUsable() ? 
Res.getAs()->getString() : ""; +} + +static StringRef getNameFromIdOrString(Parser &P, Token &Tok, + OMPContextLvl Lvl) { + if (Tok.is(tok::identifier)) { + llvm::SmallString<16> Buffer; + StringRef Name = P.getPreprocessor().getSpelling(Tok, Buffer); + (void)P.ConsumeToken(); + return Name; + } + + if (tok::isStringLiteral(Tok.getKind())) + return stringLiteralParser(P); + + P.Diag(Tok.getLocation(), + diag::warn_omp_declare_variant_string_literal_or_identifier) + << Lvl; + return ""; +} + +static bool checkForDuplicates(Parser &P, StringRef Name, + SourceLocation NameLoc, + llvm::StringMap &Seen, + OMPContextLvl Lvl) { + auto Res = Seen.try_emplace(Name, NameLoc); + if (Res.second) + return false; + + // Each trait-set-selector-name, trait-selector-name and trait-name can + // only be specified once. + P.Diag(NameLoc, diag::warn_omp_declare_variant_ctx_mutiple_use) + << Lvl << Name; + P.Diag(Res.first->getValue(), diag::note_omp_declare_variant_ctx_used_here) + << Lvl << Name; + return true; +} +} // namespace + +void Parser::parseOMPTraitPropertyKind( + OMPTraitInfo::OMPTraitProperty &TIProperty, llvm::omp::TraitSet Set, + llvm::omp::TraitSelector Selector, llvm::StringMap &Seen) { + TIProperty.Kind = TraitProperty::invalid; + + SourceLocation NameLoc = Tok.getLocation(); + StringRef Name = + getNameFromIdOrString(*this, Tok, CONTEXT_TRAIT_LVL); + if (Name.empty()) { + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_options) + << CONTEXT_TRAIT_LVL << listOpenMPContextTraitProperties(Set, Selector); + return; + } + + TIProperty.Kind = getOpenMPContextTraitPropertyKind(Set, Name); + if (TIProperty.Kind != TraitProperty::invalid) { + if (checkForDuplicates(*this, Name, NameLoc, Seen, CONTEXT_TRAIT_LVL)) + TIProperty.Kind = TraitProperty::invalid; + return; + } + + // It follows diagnosis and helping notes. + // FIXME: We should move the diagnosis string generation into libFrontend. 
+ Diag(NameLoc, diag::warn_omp_declare_variant_ctx_not_a_property) + << Name << getOpenMPContextTraitSelectorName(Selector) + << getOpenMPContextTraitSetName(Set); + + TraitSet SetForName = getOpenMPContextTraitSetKind(Name); + if (SetForName != TraitSet::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_SET_LVL << CONTEXT_TRAIT_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << Name << "" + << "()"; + return; + } + TraitSelector SelectorForName = getOpenMPContextTraitSelectorKind(Name); + if (SelectorForName != TraitSelector::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_LVL << CONTEXT_TRAIT_LVL; + bool AllowsTraitScore = false; + bool RequiresProperty = false; + isValidTraitSelectorForTraitSet( + SelectorForName, getOpenMPContextTraitSetForSelector(SelectorForName), + AllowsTraitScore, RequiresProperty); + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForSelector(SelectorForName)) + << Name << (RequiresProperty ? 
"()" : ""); + return; + } + for (const auto &PotentialSet : + {TraitSet::construct, TraitSet::user, TraitSet::implementation, + TraitSet::device}) { + TraitProperty PropertyForName = + getOpenMPContextTraitPropertyKind(PotentialSet, Name); + if (PropertyForName == TraitProperty::invalid) + continue; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(PropertyForName)) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(PropertyForName)) + << ("(" + Name + ")").str(); + return; + } + Diag(NameLoc, diag::note_omp_declare_variant_ctx_options) + << CONTEXT_TRAIT_LVL << listOpenMPContextTraitProperties(Set, Selector); +} + +void Parser::parseOMPContextProperty(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen) { + assert(TISelector.Kind != TraitSelector::user_condition && + "User conditions are special properties not handled here!"); + + SourceLocation PropertyLoc = Tok.getLocation(); + OMPTraitInfo::OMPTraitProperty TIProperty; + parseOMPTraitPropertyKind(TIProperty, Set, TISelector.Kind, Seen); + + // If we have an invalid property here we already issued a warning. + if (TIProperty.Kind == TraitProperty::invalid) { + if (PropertyLoc != Tok.getLocation()) + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_TRAIT_LVL; + return; + } + + if (isValidTraitPropertyForTraitSetAndSelector(TIProperty.Kind, + TISelector.Kind, Set)) { + // If we make it here the property, selector, set, score, condition, ... are + // all valid (or have been corrected). Thus we can record the property. 
+ TISelector.Properties.push_back(TIProperty); + return; + } + + Diag(PropertyLoc, diag::warn_omp_ctx_incompatible_property_for_selector) + << getOpenMPContextTraitPropertyName(TIProperty.Kind) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set); + Diag(PropertyLoc, diag::note_omp_ctx_compatible_set_and_selector_for_property) + << getOpenMPContextTraitPropertyName(TIProperty.Kind) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(TIProperty.Kind)) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(TIProperty.Kind)); + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_TRAIT_LVL; +} + +void Parser::parseOMPTraitSelectorKind( + OMPTraitInfo::OMPTraitSelector &TISelector, llvm::omp::TraitSet Set, + llvm::StringMap &Seen) { + TISelector.Kind = TraitSelector::invalid; + + SourceLocation NameLoc = Tok.getLocation(); + StringRef Name = getNameFromIdOrString(*this, Tok, CONTEXT_SELECTOR_LVL + ); + if (Name.empty()) { + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_LVL << listOpenMPContextTraitSelectors(Set); + return; + } + + TISelector.Kind = getOpenMPContextTraitSelectorKind(Name); + if (TISelector.Kind != TraitSelector::invalid) { + if (checkForDuplicates(*this, Name, NameLoc, Seen, CONTEXT_SELECTOR_LVL)) + TISelector.Kind = TraitSelector::invalid; + return; + } + + // It follows diagnosis and helping notes. 
+ Diag(NameLoc, diag::warn_omp_declare_variant_ctx_not_a_selector) + << Name << getOpenMPContextTraitSetName(Set); + + TraitSet SetForName = getOpenMPContextTraitSetKind(Name); + if (SetForName != TraitSet::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_SET_LVL << CONTEXT_SELECTOR_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << Name << "" + << ""; + return; + } + for (const auto &PotentialSet : + {TraitSet::construct, TraitSet::user, TraitSet::implementation, + TraitSet::device}) { + TraitProperty PropertyForName = + getOpenMPContextTraitPropertyKind(PotentialSet, Name); + if (PropertyForName == TraitProperty::invalid) + continue; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_TRAIT_LVL << CONTEXT_SELECTOR_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(PropertyForName)) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(PropertyForName)) + << ("(" + Name + ")").str(); + return; + } + Diag(NameLoc, diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_LVL << listOpenMPContextTraitSelectors(Set); +} + /// Parse optional 'score' '(' ')' ':'. 
static ExprResult parseContextScore(Parser &P) { ExprResult ScoreExpr; - Sema::OMPCtxStringType Buffer; + llvm::SmallString<16> Buffer; StringRef SelectorName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); if (!SelectorName.equals("score")) @@ -825,246 +1041,266 @@ static ExprResult parseContextScore(Parser &P) { if (P.getCurToken().is(tok::colon)) (void)P.ConsumeAnyToken(); else - P.Diag(P.getCurToken(), diag::warn_pragma_expected_colon) - << "context selector score clause"; + P.Diag(P.getCurToken(), diag::warn_omp_declare_variant_expected) + << "':'" + << "score expression"; return ScoreExpr; } -/// Parse context selector for 'implementation' selector set: -/// 'vendor' '(' [ 'score' '(' ')' ':' ] { ',' } -/// ')' -static void -parseImplementationSelector(Parser &P, SourceLocation Loc, - llvm::StringMap &UsedCtx, - SmallVectorImpl &Data) { - const Token &Tok = P.getCurToken(); - // Parse inner context selector set name, if any. - if (!Tok.is(tok::identifier)) { - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "implementation"; - // Skip until either '}', ')', or end of directive. - while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; - return; - } - Sema::OMPCtxStringType Buffer; - StringRef CtxSelectorName = P.getPreprocessor().getSpelling(Tok, Buffer); - auto Res = UsedCtx.try_emplace(CtxSelectorName, Tok.getLocation()); - if (!Res.second) { - // OpenMP 5.0, 2.3.2 Context Selectors, Restrictions. - // Each trait-selector-name can only be specified once. - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_ctx_mutiple_use) - << CtxSelectorName << "implementation"; - P.Diag(Res.first->getValue(), diag::note_omp_declare_variant_ctx_used_here) - << CtxSelectorName; - } - OpenMPContextSelectorKind CSKind = getOpenMPContextSelector(CtxSelectorName); - (void)P.ConsumeToken(); - switch (CSKind) { - case OMP_CTX_vendor: { - // Parse '('. 
- BalancedDelimiterTracker T(P, tok::l_paren, tok::annot_pragma_openmp_end); - (void)T.expectAndConsume(diag::err_expected_lparen_after, - CtxSelectorName.data()); - ExprResult Score = parseContextScore(P); - llvm::UniqueVector Vendors; - do { - // Parse . - StringRef VendorName; - if (Tok.is(tok::identifier)) { - Buffer.clear(); - VendorName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); - (void)P.ConsumeToken(); - if (!VendorName.empty()) - Vendors.insert(VendorName); - } else { - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_item_expected) - << "vendor identifier" - << "vendor" - << "implementation"; +/// Parses an OpenMP context selector. +/// +/// ['('[] [, ]* ')'] +void Parser::parseOMPContextSelector( + OMPTraitInfo::OMPTraitSelector &TISelector, llvm::omp::TraitSet Set, + llvm::StringMap &SeenSelectors) { + unsigned short OuterPC = ParenCount; + + // If anything went wrong we issue an error or warning and then skip the rest + // of the selector. However, commas are ambiguous so we look for the nesting + // of parentheses here as well. + auto FinishSelector = [OuterPC, this]() -> void { + bool Done = false; + while (!Done) { + while (!SkipUntil({tok::r_brace, tok::r_paren, tok::comma, + tok::annot_pragma_openmp_end}, + StopBeforeMatch)) + ; + if (Tok.is(tok::r_paren) && OuterPC > ParenCount) + (void)ConsumeParen(); + if (OuterPC <= ParenCount) { + Done = true; + break; } - if (!P.TryConsumeToken(tok::comma) && Tok.isNot(tok::r_paren)) { - P.Diag(Tok, diag::err_expected_punc) - << (VendorName.empty() ? "vendor name" : VendorName); + if (!Tok.is(tok::comma) && !Tok.is(tok::r_paren)) { + Done = true; + break; } - } while (Tok.is(tok::identifier)); - // Parse ')'. 
- (void)T.consumeClose(); - if (!Vendors.empty()) - Data.emplace_back(OMP_CTX_SET_implementation, CSKind, Score, Vendors); - break; + (void)ConsumeAnyToken(); + } + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_SELECTOR_LVL; + }; + + SourceLocation SelectorLoc = Tok.getLocation(); + parseOMPTraitSelectorKind(TISelector, Set, SeenSelectors); + if (TISelector.Kind == TraitSelector::invalid) + return FinishSelector(); + + bool AllowsTraitScore = false; + bool RequiresProperty = false; + if (!isValidTraitSelectorForTraitSet(TISelector.Kind, Set, AllowsTraitScore, + RequiresProperty)) { + Diag(SelectorLoc, diag::warn_omp_ctx_incompatible_selector_for_set) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set); + Diag(SelectorLoc, diag::note_omp_ctx_compatible_set_for_selector) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForSelector(TISelector.Kind)) + << RequiresProperty; + return FinishSelector(); + } + + if (!RequiresProperty) { + TISelector.Properties.push_back( + {getOpenMPContextTraitPropertyForSelector(TISelector.Kind)}); + return; } - case OMP_CTX_kind: - case OMP_CTX_unknown: - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "implementation"; - // Skip until either '}', ')', or end of directive. 
- while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; + + if (!Tok.is(tok::l_paren)) { + Diag(SelectorLoc, diag::warn_omp_ctx_selector_without_properties) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set); + return FinishSelector(); + } + + if (TISelector.Kind == TraitSelector::user_condition) { + SourceLocation RLoc; + ExprResult Condition = ParseOpenMPParensExpr("user condition", RLoc); + if (!Condition.isUsable()) + return FinishSelector(); + TISelector.ScoreOrCondition = Condition.get(); + TISelector.Properties.push_back({TraitProperty::user_condition_unknown}); return; } + + BalancedDelimiterTracker BDT(*this, tok::l_paren, + tok::annot_pragma_openmp_end); + // Parse '('. + (void)BDT.consumeOpen(); + + ExprResult Score = parseContextScore(*this); + + if (!AllowsTraitScore && Score.isUsable()) { + Diag(Score.get()->getBeginLoc(), + diag::warn_omp_ctx_incompatible_score_for_property) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set) << Score.get(); + Score = ExprResult(); + } + + if (Score.isUsable()) + TISelector.ScoreOrCondition = Score.get(); + + llvm::StringMap SeenProperties; + do { + parseOMPContextProperty(TISelector, Set, SeenProperties); + } while (TryConsumeToken(tok::comma)); + + // Parse ')'. + BDT.consumeClose(); } -/// Parse context selector for 'device' selector set: -/// 'kind' '(' { ',' } ')' -static void -parseDeviceSelector(Parser &P, SourceLocation Loc, - llvm::StringMap &UsedCtx, - SmallVectorImpl &Data) { - const Token &Tok = P.getCurToken(); - // Parse inner context selector set name, if any. - if (!Tok.is(tok::identifier)) { - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "device"; - // Skip until either '}', ')', or end of directive. 
- while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; +void Parser::parseOMPTraitSetKind(OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &Seen) { + TISet.Kind = TraitSet::invalid; + + SourceLocation NameLoc = Tok.getLocation(); + StringRef Name = getNameFromIdOrString(*this, Tok, CONTEXT_SELECTOR_SET_LVL + ); + if (Name.empty()) { + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_SET_LVL << listOpenMPContextTraitSets(); return; } - Sema::OMPCtxStringType Buffer; - StringRef CtxSelectorName = P.getPreprocessor().getSpelling(Tok, Buffer); - auto Res = UsedCtx.try_emplace(CtxSelectorName, Tok.getLocation()); - if (!Res.second) { - // OpenMP 5.0, 2.3.2 Context Selectors, Restrictions. - // Each trait-selector-name can only be specified once. - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_ctx_mutiple_use) - << CtxSelectorName << "device"; - P.Diag(Res.first->getValue(), diag::note_omp_declare_variant_ctx_used_here) - << CtxSelectorName; - } - OpenMPContextSelectorKind CSKind = getOpenMPContextSelector(CtxSelectorName); - (void)P.ConsumeToken(); - switch (CSKind) { - case OMP_CTX_kind: { - // Parse '('. - BalancedDelimiterTracker T(P, tok::l_paren, tok::annot_pragma_openmp_end); - (void)T.expectAndConsume(diag::err_expected_lparen_after, - CtxSelectorName.data()); - llvm::UniqueVector Kinds; - do { - // Parse . 
- StringRef KindName; - if (Tok.is(tok::identifier)) { - Buffer.clear(); - KindName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); - SourceLocation SLoc = P.getCurToken().getLocation(); - (void)P.ConsumeToken(); - if (llvm::StringSwitch(KindName) - .Case("host", false) - .Case("nohost", false) - .Case("cpu", false) - .Case("gpu", false) - .Case("fpga", false) - .Default(true)) { - P.Diag(SLoc, diag::err_omp_wrong_device_kind_trait) << KindName; - } else { - Kinds.insert(KindName); - } - } else { - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_item_expected) - << "'host', 'nohost', 'cpu', 'gpu', or 'fpga'" - << "kind" - << "device"; + + TISet.Kind = getOpenMPContextTraitSetKind(Name); + if (TISet.Kind != TraitSet::invalid) { + if (checkForDuplicates(*this, Name, NameLoc, Seen, + CONTEXT_SELECTOR_SET_LVL)) + TISet.Kind = TraitSet::invalid; + return; + } + + // It follows diagnosis and helping notes. + Diag(NameLoc, diag::warn_omp_declare_variant_ctx_not_a_set) << Name; + + TraitSelector SelectorForName = getOpenMPContextTraitSelectorKind(Name); + if (SelectorForName != TraitSelector::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_LVL << CONTEXT_SELECTOR_SET_LVL; + bool AllowsTraitScore = false; + bool RequiresProperty = false; + isValidTraitSelectorForTraitSet( + SelectorForName, getOpenMPContextTraitSetForSelector(SelectorForName), + AllowsTraitScore, RequiresProperty); + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForSelector(SelectorForName)) + << Name << (RequiresProperty ? 
"()" : ""); + return; + } + for (const auto &PotentialSet : + {TraitSet::construct, TraitSet::user, TraitSet::implementation, + TraitSet::device}) { + TraitProperty PropertyForName = + getOpenMPContextTraitPropertyKind(PotentialSet, Name); + if (PropertyForName == TraitProperty::invalid) + continue; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_TRAIT_LVL << CONTEXT_SELECTOR_SET_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(PropertyForName)) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(PropertyForName)) + << ("(" + Name + ")").str(); + return; + } + Diag(NameLoc, diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_SET_LVL << listOpenMPContextTraitSets(); +} + +/// Parses an OpenMP context selector set. +/// +/// '=' '{' [, ]* '}' +void Parser::parseOMPContextSelectorSet( + OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &SeenSets) { + auto OuterBC = BraceCount; + + // If anything went wrong we issue an error or warning and then skip the rest + // of the set. However, commas are ambiguous so we look for the nesting + // of braces here as well. + auto FinishSelectorSet = [this, OuterBC]() -> void { + bool Done = false; + while (!Done) { + while (!SkipUntil({tok::comma, tok::r_brace, tok::r_paren, + tok::annot_pragma_openmp_end}, + StopBeforeMatch)) + ; + if (Tok.is(tok::r_brace) && OuterBC > BraceCount) + (void)ConsumeBrace(); + if (OuterBC <= BraceCount) { + Done = true; + break; } - if (!P.TryConsumeToken(tok::comma) && Tok.isNot(tok::r_paren)) { - P.Diag(Tok, diag::err_expected_punc) - << (KindName.empty() ? "kind of device" : KindName); + if (!Tok.is(tok::comma) && !Tok.is(tok::r_brace)) { + Done = true; + break; } - } while (Tok.is(tok::identifier)); - // Parse ')'. 
- (void)T.consumeClose(); - if (!Kinds.empty()) - Data.emplace_back(OMP_CTX_SET_device, CSKind, ExprResult(), Kinds); - break; + (void)ConsumeAnyToken(); + } + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_SELECTOR_SET_LVL; + }; + + parseOMPTraitSetKind(TISet, SeenSets); + if (TISet.Kind == TraitSet::invalid) + return FinishSelectorSet(); + + // Parse '='. + if (!TryConsumeToken(tok::equal)) + Diag(Tok.getLocation(), diag::warn_omp_declare_variant_expected) + << "=" + << ("context set name \"" + getOpenMPContextTraitSetName(TISet.Kind) + + "\"") + .str(); + + // Parse '{'. + if (Tok.is(tok::l_brace)) { + (void)ConsumeBrace(); + } else { + Diag(Tok.getLocation(), diag::warn_omp_declare_variant_expected) + << "{" + << ("'=' that follows the context set name \"" + + getOpenMPContextTraitSetName(TISet.Kind) + "\"") + .str(); } - case OMP_CTX_vendor: - case OMP_CTX_unknown: - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "device"; - // Skip until either '}', ')', or end of directive. - while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; - return; + + llvm::StringMap SeenSelectors; + do { + OMPTraitInfo::OMPTraitSelector TISelector; + parseOMPContextSelector(TISelector, TISet.Kind, SeenSelectors); + if (TISelector.Kind != TraitSelector::invalid && + !TISelector.Properties.empty()) + TISet.Selectors.push_back(TISelector); + } while (TryConsumeToken(tok::comma)); + + // Parse '}'. + if (Tok.is(tok::r_brace)) { + (void)ConsumeBrace(); + } else { + Diag(Tok.getLocation(), diag::warn_omp_declare_variant_expected) + << "}" + << ("context selectors for the context set \"" + + getOpenMPContextTraitSetName(TISet.Kind) + "\"") + .str(); } } -/// Parses clauses for 'declare variant' directive. 
-/// clause: -/// '=' '{' '}' -/// [ ',' '=' '{' '}' ] -bool Parser::parseOpenMPContextSelectors( - SourceLocation Loc, SmallVectorImpl &Data) { - llvm::StringMap UsedCtxSets; +/// Parse OpenMP context selectors: +/// +/// [, ]* +bool Parser::parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI) { + llvm::StringMap SeenSets; do { - // Parse inner context selector set name. - if (!Tok.is(tok::identifier)) { - Diag(Tok.getLocation(), diag::err_omp_declare_variant_no_ctx_selector) - << getOpenMPClauseName(OMPC_match); - return true; - } - Sema::OMPCtxStringType Buffer; - StringRef CtxSelectorSetName = PP.getSpelling(Tok, Buffer); - auto Res = UsedCtxSets.try_emplace(CtxSelectorSetName, Tok.getLocation()); - if (!Res.second) { - // OpenMP 5.0, 2.3.2 Context Selectors, Restrictions. - // Each trait-set-selector-name can only be specified once. - Diag(Tok.getLocation(), diag::err_omp_declare_variant_ctx_set_mutiple_use) - << CtxSelectorSetName; - Diag(Res.first->getValue(), - diag::note_omp_declare_variant_ctx_set_used_here) - << CtxSelectorSetName; - } - // Parse '='. - (void)ConsumeToken(); - if (Tok.isNot(tok::equal)) { - Diag(Tok.getLocation(), diag::err_omp_declare_variant_equal_expected) - << CtxSelectorSetName; - return true; - } - (void)ConsumeToken(); - // TBD: add parsing of known context selectors. - // Unknown selector - just ignore it completely. - { - // Parse '{'. - BalancedDelimiterTracker TBr(*this, tok::l_brace, - tok::annot_pragma_openmp_end); - if (TBr.expectAndConsume(diag::err_expected_lbrace_after, "=")) - return true; - OpenMPContextSelectorSetKind CSSKind = - getOpenMPContextSelectorSet(CtxSelectorSetName); - llvm::StringMap UsedCtx; - do { - switch (CSSKind) { - case OMP_CTX_SET_implementation: - parseImplementationSelector(*this, Loc, UsedCtx, Data); - break; - case OMP_CTX_SET_device: - parseDeviceSelector(*this, Loc, UsedCtx, Data); - break; - case OMP_CTX_SET_unknown: - // Skip until either '}', ')', or end of directive. 
- while (!SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, StopBeforeMatch)) - ; - break; - } - const Token PrevTok = Tok; - if (!TryConsumeToken(tok::comma) && Tok.isNot(tok::r_brace)) - Diag(Tok, diag::err_omp_expected_comma_brace) - << (PrevTok.isAnnotation() ? "context selector trait" - : PP.getSpelling(PrevTok)); - } while (Tok.is(tok::identifier)); - // Parse '}'. - (void)TBr.consumeClose(); - } - // Consume ',' - if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) - (void)ExpectAndConsume(tok::comma); - } while (Tok.isAnyIdentifier()); + OMPTraitInfo::OMPTraitSet TISet; + parseOMPContextSelectorSet(TISet, SeenSets); + if (TISet.Kind != TraitSet::invalid && !TISet.Selectors.empty()) + TI.Sets.push_back(TISet); + } while (TryConsumeToken(tok::comma)); + return false; } @@ -1102,9 +1338,6 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, (void)ConsumeAnnotationToken(); return; } - Optional> DeclVarData = - Actions.checkOpenMPDeclareVariantFunction( - Ptr, AssociatedFunction.get(), SourceRange(Loc, Tok.getLocation())); // Parse 'match'. OpenMPClauseKind CKind = Tok.isAnnotation() @@ -1132,24 +1365,27 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, } // Parse inner context selectors. - SmallVector Data; - if (!parseOpenMPContextSelectors(Loc, Data)) { - // Parse ')'. - (void)T.consumeClose(); - // Need to check for extra tokens. - if (Tok.isNot(tok::annot_pragma_openmp_end)) { - Diag(Tok, diag::warn_omp_extra_tokens_at_eol) - << getOpenMPDirectiveName(OMPD_declare_variant); - } - } + OMPTraitInfo *TI = new OMPTraitInfo(); + parseOMPContextSelectors(Loc, *TI); + + // Parse ')' + (void)T.consumeClose(); + + Optional> DeclVarData = + Actions.checkOpenMPDeclareVariantFunction( + Ptr, AssociatedFunction.get(), *TI, + SourceRange(Loc, Tok.getLocation())); // Skip last tokens. 
while (Tok.isNot(tok::annot_pragma_openmp_end)) ConsumeAnyToken(); - if (DeclVarData.hasValue()) + if (DeclVarData.hasValue() && !TI->Sets.empty()) Actions.ActOnOpenMPDeclareVariantDirective( - DeclVarData.getValue().first, DeclVarData.getValue().second, - SourceRange(Loc, Tok.getLocation()), Data); + DeclVarData.getValue().first, DeclVarData.getValue().second, TI, + SourceRange(Loc, Tok.getLocation())); + else + delete TI; + // Skip the last annot_pragma_openmp_end. (void)ConsumeAnnotationToken(); } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9feee9dac02d7..1c396c8b66fca 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -14157,11 +14157,7 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, // Warn if K&R function is defined without a previous declaration. // This warning is issued only if the definition itself does not provide // a prototype. Only K&R definitions do not provide a prototype. - // An empty list in a function declarator that is part of a definition - // of that function specifies that the function has no parameters - // (C99 6.7.5.3p14) - if (!FD->hasWrittenPrototype() && FD->getNumParams() > 0 && - !LangOpts.CPlusPlus) { + if (!FD->hasWrittenPrototype()) { TypeSourceInfo *TI = FD->getTypeSourceInfo(); TypeLoc TL = TI->getTypeLoc(); FunctionTypeLoc FTL = TL.getAsAdjusted(); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 5c79eb26394e0..9b3f5d87742e1 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5369,7 +5369,8 @@ static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto, Optional> Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, - Expr *VariantRef, SourceRange SR) { + Expr *VariantRef, OMPTraitInfo &TI, + SourceRange SR) { if (!DG || DG.get().isNull()) return None; @@ -5422,12 +5423,41 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return None; } + auto 
ShouldDelayChecks = [](Expr *&E, bool) { + return E && (E->isTypeDependent() || E->isValueDependent() || + E->containsUnexpandedParameterPack() || + E->isInstantiationDependent()); + }; // Do not check templates, wait until instantiation. - if (VariantRef->isTypeDependent() || VariantRef->isValueDependent() || - VariantRef->containsUnexpandedParameterPack() || - VariantRef->isInstantiationDependent() || FD->isDependentContext()) + if (FD->isDependentContext() || ShouldDelayChecks(VariantRef, false) || + TI.anyScoreOrCondition(ShouldDelayChecks)) return std::make_pair(FD, VariantRef); + // Deal with non-constant score and user condition expressions. + auto HandleNonConstantScoresAndConditions = [this](Expr *&E, + bool IsScore) -> bool { + llvm::APSInt Result; + if (!E || E->isIntegerConstantExpr(Result, Context)) + return false; + + if (IsScore) { + // We warn on non-constant scores and pretend they were not present. + Diag(E->getExprLoc(), diag::warn_omp_declare_variant_score_not_constant) + << E; + E = nullptr; + } else { + // We could replace a non-constant user condition with "false" but we + // will soon need to handle these anyway for the dynamic version of + // OpenMP context selectors. + Diag(E->getExprLoc(), + diag::err_omp_declare_variant_user_condition_not_constant) + << E; + } + return true; + }; + if (TI.anyScoreOrCondition(HandleNonConstantScoresAndConditions)) + return None; + // Convert VariantRef expression to the type of the original function to // resolve possible conflicts. 
ExprResult VariantRefCast; @@ -5600,75 +5630,13 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return std::make_pair(FD, cast(DRE)); } -void Sema::ActOnOpenMPDeclareVariantDirective( - FunctionDecl *FD, Expr *VariantRef, SourceRange SR, - ArrayRef Data) { - if (Data.empty()) - return; - SmallVector CtxScores; - SmallVector CtxSets; - SmallVector Ctxs; - SmallVector ImplVendors, DeviceKinds; - bool IsError = false; - for (const OMPCtxSelectorData &D : Data) { - OpenMPContextSelectorSetKind CtxSet = D.CtxSet; - OpenMPContextSelectorKind Ctx = D.Ctx; - if (CtxSet == OMP_CTX_SET_unknown || Ctx == OMP_CTX_unknown) - return; - Expr *Score = nullptr; - if (D.Score.isUsable()) { - Score = D.Score.get(); - if (!Score->isTypeDependent() && !Score->isValueDependent() && - !Score->isInstantiationDependent() && - !Score->containsUnexpandedParameterPack()) { - Score = - PerformOpenMPImplicitIntegerConversion(Score->getExprLoc(), Score) - .get(); - if (Score) - Score = VerifyIntegerConstantExpression(Score).get(); - } - } else { - // OpenMP 5.0, 2.3.3 Matching and Scoring Context Selectors. - // The kind, arch, and isa selectors are given the values 2^l, 2^(l+1) and - // 2^(l+2), respectively, where l is the number of traits in the construct - // set. - // TODO: implement correct logic for isa and arch traits. - // TODO: take the construct context set into account when it is - // implemented. - int L = 0; // Currently set the number of traits in construct set to 0, - // since the construct trait set in not supported yet. 
- if (CtxSet == OMP_CTX_SET_device && Ctx == OMP_CTX_kind) - Score = ActOnIntegerConstant(SourceLocation(), std::pow(2, L)).get(); - else - Score = ActOnIntegerConstant(SourceLocation(), 0).get(); - } - switch (Ctx) { - case OMP_CTX_vendor: - assert(CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - ImplVendors.append(D.Names.begin(), D.Names.end()); - break; - case OMP_CTX_kind: - assert(CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - DeviceKinds.append(D.Names.begin(), D.Names.end()); - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector kind."); - } - IsError = IsError || !Score; - CtxSets.push_back(CtxSet); - Ctxs.push_back(Ctx); - CtxScores.push_back(Score); - } - if (!IsError) { - auto *NewAttr = OMPDeclareVariantAttr::CreateImplicit( - Context, VariantRef, CtxScores.begin(), CtxScores.size(), - CtxSets.begin(), CtxSets.size(), Ctxs.begin(), Ctxs.size(), - ImplVendors.begin(), ImplVendors.size(), DeviceKinds.begin(), - DeviceKinds.size(), SR); - FD->addAttr(NewAttr); - } +void Sema::ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD, + Expr *VariantRef, + OMPTraitInfo *TI, + SourceRange SR) { + auto *NewAttr = + OMPDeclareVariantAttr::CreateImplicit(Context, VariantRef, TI, SR); + FD->addAttr(NewAttr); } void Sema::markOpenMPDeclareVariantFuncsReferenced(SourceLocation Loc, @@ -10481,7 +10449,6 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective( CS->getCapturedDecl()->setNothrow(); } - OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 0ccd188f58e43..f059de5ee4219 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -394,50 +394,43 @@ static void instantiateOMPDeclareVariantAttr( VariantFuncRef = Subst(E); } + // Copy the template version of the OMPTraitInfo and run substitute on all + // score and condition expressions. + OMPTraitInfo *TI = new OMPTraitInfo(); + *TI = *Attr.getTraitInfos(); + + // Try to substitute template parameters in score and condition expressions. + auto SubstScoreOrConditionExpr = [&S, Subst](Expr *&E, bool) { + if (E) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult ER = Subst(E); + if (ER.isUsable()) + E = ER.get(); + else + return true; + } + return false; + }; + if (TI->anyScoreOrCondition(SubstScoreOrConditionExpr)) { + delete TI; + return; + } + // Check function/variant ref. Optional> DeclVarData = - S.checkOpenMPDeclareVariantFunction( - S.ConvertDeclToDeclGroup(New), VariantFuncRef.get(), Attr.getRange()); - if (!DeclVarData) + S.checkOpenMPDeclareVariantFunction(S.ConvertDeclToDeclGroup(New), + VariantFuncRef.get(), *TI, + Attr.getRange()); + + if (!DeclVarData) { + delete TI; return; - SmallVector Data; - for (unsigned I = 0, E = Attr.scores_size(); I < E; ++I) { - ExprResult Score; - if (Expr *E = *std::next(Attr.scores_begin(), I)) - Score = Subst(E); - // Instantiate the attribute.
- auto CtxSet = static_cast( - *std::next(Attr.ctxSelectorSets_begin(), I)); - auto Ctx = static_cast( - *std::next(Attr.ctxSelectors_begin(), I)); - switch (CtxSet) { - case OMP_CTX_SET_implementation: - switch (Ctx) { - case OMP_CTX_vendor: - Data.emplace_back(CtxSet, Ctx, Score, Attr.implVendors()); - break; - case OMP_CTX_kind: - case OMP_CTX_unknown: - llvm_unreachable("Unexpected context selector kind."); - } - break; - case OMP_CTX_SET_device: - switch (Ctx) { - case OMP_CTX_kind: - Data.emplace_back(CtxSet, Ctx, Score, Attr.deviceKinds()); - break; - case OMP_CTX_vendor: - case OMP_CTX_unknown: - llvm_unreachable("Unexpected context selector kind."); - } - break; - case OMP_CTX_SET_unknown: - llvm_unreachable("Unexpected context selector set kind."); - } } + S.ActOnOpenMPDeclareVariantDirective(DeclVarData.getValue().first, - DeclVarData.getValue().second, - Attr.getRange(), Data); + DeclVarData.getValue().second, TI, + Attr.getRange()); } static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr( diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index a1161d2648387..fbd59b9319535 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12612,3 +12612,22 @@ void OMPClauseReader::VisitOMPOrderClause(OMPOrderClause *C) { C->setLParenLoc(Record.readSourceLocation()); C->setKindKwLoc(Record.readSourceLocation()); } + +OMPTraitInfo *ASTRecordReader::readOMPTraitInfo() { + OMPTraitInfo *TI = new OMPTraitInfo(); + TI->Sets.resize(readUInt32()); + for (auto &Set : TI->Sets) { + Set.Kind = readEnum(); + Set.Selectors.resize(readUInt32()); + for (auto &Selector : Set.Selectors) { + Selector.Kind = readEnum(); + Selector.ScoreOrCondition = nullptr; + if (readBool()) + Selector.ScoreOrCondition = readExprRef(); + Selector.Properties.resize(readUInt32()); + for (auto &Property : Selector.Properties) + Property.Kind = readEnum(); + } + } + return TI; +} diff --git 
a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 20a4f78f16e97..45c10be8add72 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2756,6 +2756,8 @@ class AttrReader { return Reader.readVersionTuple(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } + template T *GetLocalDeclAs(uint32_t LocalID) { return Reader.GetLocalDeclAs(LocalID); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index f935a69769bf9..018a7386296dc 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6578,3 +6578,19 @@ void OMPClauseWriter::VisitOMPOrderClause(OMPOrderClause *C) { Record.AddSourceLocation(C->getKindKwLoc()); } +void ASTRecordWriter::writeOMPTraitInfo(OMPTraitInfo *TI) { + writeUInt32(TI->Sets.size()); + for (const auto &Set : TI->Sets) { + writeEnum(Set.Kind); + writeUInt32(Set.Selectors.size()); + for (const auto &Selector : Set.Selectors) { + writeEnum(Selector.Kind); + writeBool(Selector.ScoreOrCondition); + if (Selector.ScoreOrCondition) + writeExprRef(Selector.ScoreOrCondition); + writeUInt32(Selector.Properties.size()); + for (const auto &Property : Selector.Properties) + writeEnum(Property.Kind); + } + } +} diff --git a/clang/lib/Tooling/AllTUsExecution.cpp b/clang/lib/Tooling/AllTUsExecution.cpp index d85075f596079..777857a49e81f 100644 --- a/clang/lib/Tooling/AllTUsExecution.cpp +++ b/clang/lib/Tooling/AllTUsExecution.cpp @@ -114,8 +114,7 @@ llvm::Error AllTUsToolExecutor::execute( auto &Action = Actions.front(); { - llvm::ThreadPool Pool(ThreadCount == 0 ? 
llvm::hardware_concurrency() - : ThreadCount); + llvm::ThreadPool Pool(llvm::hardware_concurrency(ThreadCount)); for (std::string File : Files) { Pool.async( [&](std::string Path) { diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp index b4d5a29ca6959..b1b87e7fa5734 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp @@ -106,7 +106,8 @@ DependencyScanningFilesystemSharedCache:: // sharding gives a performance edge by reducing the lock contention. // FIXME: A better heuristic might also consider the OS to account for // the different cost of lock contention on different OSes. - NumShards = std::max(2u, llvm::hardware_concurrency() / 4); + NumShards = + std::max(2u, llvm::hardware_concurrency().compute_thread_count() / 4); CacheShards = std::make_unique(NumShards); } diff --git a/clang/test/CodeGen/ppc-emmintrin.c b/clang/test/CodeGen/ppc-emmintrin.c index c14b2dd210f89..631b6c9d2614a 100644 --- a/clang/test/CodeGen/ppc-emmintrin.c +++ b/clang/test/CodeGen/ppc-emmintrin.c @@ -2,9 +2,9 @@ // REQUIRES: powerpc-registered-target // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | 
llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE // CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> , align 16 // CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4 diff --git a/clang/test/CodeGen/ppc-xmmintrin.c b/clang/test/CodeGen/ppc-xmmintrin.c index d7499cbedc48d..e9466b32257f0 100644 --- a/clang/test/CodeGen/ppc-xmmintrin.c +++ b/clang/test/CodeGen/ppc-xmmintrin.c @@ -2,9 +2,9 @@ // REQUIRES: powerpc-registered-target // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE #include diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtbegin.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtend.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crti.o 
b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtn.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib64/crt1.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib64/crt1.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c index de83e4e4c9130..8bf53f6d997b3 100644 --- a/clang/test/Driver/fp-model.c +++ b/clang/test/Driver/fp-model.c @@ -1,87 +1,85 @@ // Test that incompatible combinations of -ffp-model= options // and other floating point options get a warning diagnostic. +// +// REQUIRES: clang-driver -// RUN: %clang -target x86_64 -### -ffp-model=fast -ffp-contract=off -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=fast -ffp-contract=off -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN %s // WARN: warning: overriding '-ffp-model=fast' option with '-ffp-contract=off' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=fast -ffp-contract=on -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=fast -ffp-contract=on -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN1 %s // WARN1: warning: overriding '-ffp-model=fast' option with '-ffp-contract=on' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fassociative-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fassociative-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN2 %s // WARN2: warning: overriding '-ffp-model=strict' option with '-fassociative-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffast-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffast-math -c %s 2>&1 
\ // RUN: | FileCheck --check-prefix=WARN3 %s // WARN3: warning: overriding '-ffp-model=strict' option with '-ffast-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffinite-math-only -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffinite-math-only -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN4 %s // WARN4: warning: overriding '-ffp-model=strict' option with '-ffinite-math-only' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN5 %s // WARN5: warning: overriding '-ffp-model=strict' option with '-ffp-contract=fast' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ -// RUN: | FileCheck --check-prefix=WARN6 %s -// WARN6: warning: overriding '-ffp-model=strict' option with '-ffp-contract=fast' [-Woverriding-t-option] - -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffp-contract=on -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffp-contract=on -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN7 %s // WARN7: warning: overriding '-ffp-model=strict' option with '-ffp-contract=on' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-honor-infinities -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-honor-infinities -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN8 %s // WARN8: warning: overriding '-ffp-model=strict' option with '-fno-honor-infinities' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-honor-nans -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-honor-nans -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN9 %s // WARN9: warning: overriding '-ffp-model=strict' option with '-fno-honor-nans' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-rounding-math -c %s 
2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-rounding-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNa %s // WARNa: warning: overriding '-ffp-model=strict' option with '-fno-rounding-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-signed-zeros -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-signed-zeros -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNb %s // WARNb: warning: overriding '-ffp-model=strict' option with '-fno-signed-zeros' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-trapping-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-trapping-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNc %s // WARNc: warning: overriding '-ffp-model=strict' option with '-fno-trapping-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -freciprocal-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -freciprocal-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNd %s // WARNd: warning: overriding '-ffp-model=strict' option with '-freciprocal-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -funsafe-math-optimizations -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -funsafe-math-optimizations -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNe %s // WARNe: warning: overriding '-ffp-model=strict' option with '-funsafe-math-optimizations' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -Ofast -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -Ofast -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNf %s // WARNf: warning: overriding '-ffp-model=strict' option with '-Ofast' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -c %s 2>&1 \ +// RUN: %clang -### -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-NOROUND %s // CHECK-NOROUND: "-cc1" // CHECK-NOROUND: "-fno-rounding-math" -// RUN: %clang -target x86_64 -### 
-frounding-math -c %s 2>&1 \ +// RUN: %clang -### -frounding-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-ROUND --implicit-check-not ffp-exception-behavior=strict %s // CHECK-ROUND: "-cc1" // CHECK-ROUND: "-frounding-math" -// RUN: %clang -target x86_64 -### -ftrapping-math -c %s 2>&1 \ +// RUN: %clang -### -ftrapping-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-TRAP %s // CHECK-TRAP: "-cc1" // CHECK-TRAP: "-ftrapping-math" // CHECK-TRAP: "-ffp-exception-behavior=strict" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-model=fast -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-model=fast -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FPM-FAST %s // CHECK-FPM-FAST: "-cc1" // CHECK-FPM-FAST: "-menable-no-infs" @@ -95,41 +93,41 @@ // CHECK-FPM-FAST: "-ffast-math" // CHECK-FPM-FAST: "-ffinite-math-only" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-model=precise -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-model=precise -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FPM-PRECISE %s // CHECK-FPM-PRECISE: "-cc1" -// CHECK-FPM-PRECISE: "-ffp-contract=on" +// CHECK-FPM-PRECISE: "-ffp-contract=fast" // CHECK-FPM-PRECISE: "-fno-rounding-math" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-model=strict -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-model=strict -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FPM-STRICT %s // CHECK-FPM-STRICT: "-cc1" // CHECK-FPM-STRICT: "-ftrapping-math" -// CHECK-FPM-STRICT: "-ffp-contract=off" // CHECK-FPM-STRICT: "-frounding-math" // CHECK-FPM-STRICT: "-ffp-exception-behavior=strict" -// RUN: %clang -target x86_64 -### -nostdinc -ftrapping-math -ffp-exception-behavior=ignore -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ftrapping-math -ffp-exception-behavior=ignore -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-TRAP-IGNORE %s // CHECK-TRAP-IGNORE: "-cc1" // CHECK-TRAP-IGNORE: "-fno-rounding-math" // CHECK-TRAP-IGNORE: "-ffp-exception-behavior=ignore" -// RUN: %clang 
-target x86_64 -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FEB-STRICT %s // CHECK-FEB-STRICT: "-cc1" // CHECK-FEB-STRICT: "-fno-rounding-math" // CHECK-FEB-STRICT: "-ffp-exception-behavior=strict" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-exception-behavior=maytrap -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-exception-behavior=maytrap -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FEB-MAYTRAP %s // CHECK-FEB-MAYTRAP: "-cc1" // CHECK-FEB-MAYTRAP: "-fno-rounding-math" // CHECK-FEB-MAYTRAP: "-ffp-exception-behavior=maytrap" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-exception-behavior=ignore -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-exception-behavior=ignore -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FEB-IGNORE %s // CHECK-FEB-IGNORE: "-cc1" // CHECK-FEB-IGNORE: "-fno-rounding-math" // CHECK-FEB-IGNORE: "-ffp-exception-behavior=ignore" + diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index 51227550b528d..ec539522c25dc 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -769,6 +769,21 @@ // CHECK-FEDORA-21-AARCH64: "{{.*}}/usr/lib/gcc/aarch64-redhat-linux/4.9.0{{/|\\\\}}crtend.o" // CHECK-FEDORA-21-AARCH64: "{{.*}}/usr/lib/gcc/aarch64-redhat-linux/4.9.0/../../../../lib64{{/|\\\\}}crtn.o" // +// Check Fedora 31 on riscv64. 
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ +// RUN: --target=riscv64-redhat-linux -rtlib=platform \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/fedora_31_riscv64_tree \ +// RUN: | FileCheck --check-prefix=CHECK-FEDORA-31-RISCV64 %s +// CHECK-FEDORA-31-RISCV64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9/../../../../lib64{{/|\\\\}}crt1.o" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crti.o" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crtbegin.o" +// CHECK-FEDORA-31-RISCV64: "-L[[SYSROOT]]/usr/lib/gcc/riscv64-redhat-linux/9" +// CHECK-FEDORA-31-RISCV64: "-L[[SYSROOT]]/usr/lib/gcc/riscv64-redhat-linux/9/../../../../lib64" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crtend.o" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crtn.o" +// // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: --target=arm-unknown-linux-gnueabi -rtlib=platform \ // RUN: --gcc-toolchain="" \ diff --git a/clang/test/OpenMP/declare_variant_ast_print.c b/clang/test/OpenMP/declare_variant_ast_print.c index 0173626a79085..515d3167627c4 100644 --- a/clang/test/OpenMP/declare_variant_ast_print.c +++ b/clang/test/OpenMP/declare_variant_ast_print.c @@ -8,7 +8,7 @@ int foo(void); #pragma omp declare variant(foo) match(xxx={}, yyy={ccc}) #pragma omp declare variant(foo) match(xxx={vvv}) -#pragma omp declare variant(foo) match(implementation={vendor(llvm)}, device={kind(fpga)}) +#pragma omp declare variant(foo) match(implementation={vendor(score(0):llvm)}, device={kind(fpga)}) #pragma omp declare variant(foo) match(implementation={vendor(llvm), xxx}) #pragma omp declare variant(foo) match(implementation={vendor(unknown)}, device={kind(gpu)}) #pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm, xxx, ibm)}, 
device={kind(cpu, nohost)}) @@ -19,8 +19,8 @@ int bar(void); // CHECK: int foo(); // CHECK-NEXT: #pragma omp declare variant(foo) match(device={kind(nohost)}) // CHECK-NEXT: #pragma omp declare variant(foo) match(device={kind(host)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(5):ibm, xxx)},device={kind(cpu, nohost)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0):unknown)},device={kind(gpu)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0):llvm)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0):llvm)},device={kind(fpga)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm)}, device={kind(cpu, nohost)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(unknown)}, device={kind(gpu)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(llvm)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0): llvm)}, device={kind(fpga)}) // CHECK-NEXT: int bar(); diff --git a/clang/test/OpenMP/declare_variant_ast_print.cpp b/clang/test/OpenMP/declare_variant_ast_print.cpp index 4964c692166fa..fdc6d18ca1340 100644 --- a/clang/test/OpenMP/declare_variant_ast_print.cpp +++ b/clang/test/OpenMP/declare_variant_ast_print.cpp @@ -17,36 +17,40 @@ T foofoo() { return T(); } // CHECK-NEXT: return int(); // CHECK-NEXT: } -// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(5):ibm)},device={kind(fpga)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):llvm)},device={kind(cpu)}) +// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(5): ibm)}, device={kind(fpga)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) 
match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0): llvm)}, device={kind(cpu)}) // CHECK-NEXT: int bar(); #pragma omp declare variant(foofoo ) match(xxx = {}) #pragma omp declare variant(foofoo ) match(xxx = {vvv}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(llvm), xxx}, device={kind(cpu)}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(unknown)}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm)}, device={kind(fpga)}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor(score(0): "llvm"), xxx}, device = {kind(cpu)}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor("unknown")}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor(score(5): ibm)}, device = {kind(fpga)}) int bar(); -// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(C + 5):ibm, xxx)},device={kind(cpu, host)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):llvm)},device={kind(cpu)}) +// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(C + 5): ibm)}, device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(llvm)}, device={kind(cpu)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(false)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(true)}) // CHECK-NEXT: template T barbar(); #pragma omp declare variant(foofoo ) match(xxx = {}) #pragma omp declare variant(foofoo ) match(xxx = {vvv}) -#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = 
{score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(llvm)},device={kind(cpu)}) +#pragma omp declare variant(foofoo ) match(user = {score(1 * 1 + 1) : condition(100 > 10 + 2)}) +#pragma omp declare variant(foofoo ) match(user = {score(0) : condition(0)}) +#pragma omp declare variant(foofoo ) match(user = {condition(true)}) +#pragma omp declare variant(foofoo ) match(user = {condition(false)}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor(llvm)}, device = {kind(cpu)}) #pragma omp declare variant(foofoo ) match(implementation={vendor(unknown)}) #pragma omp declare variant(foofoo ) match(implementation={vendor(score(C+5): ibm, xxx, ibm)},device={kind(cpu,host)}) template T barbar(); -// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(3 + 5):ibm, xxx)},device={kind(cpu, host)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):llvm)},device={kind(cpu)}) +// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(3 + 5): ibm)}, device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(llvm)}, device={kind(cpu)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(false)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(true)}) // CHECK-NEXT: template<> int barbar(); // CHECK-NEXT: int baz() { @@ -66,19 +70,19 @@ template void h_ref(C *hp, C *hp2, C *hq, C *lin) { } -// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(score(0):unknown)},device={kind(nohost)}) -// CHECK-NEXT: #pragma 
omp declare variant(h_ref) match(implementation={vendor(score(0):llvm)},device={kind(gpu)}) +// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(unknown)}, device={kind(nohost)}) +// CHECK-NEXT: #pragma omp declare variant(h_ref) match(implementation={vendor(llvm)}, device={kind(gpu)}) // CHECK-NEXT: template void h(C *hp, C *hp2, C *hq, C *lin) { // CHECK-NEXT: } #pragma omp declare variant(h_ref ) match(xxx = {}) -#pragma omp declare variant(h_ref ) match(implementation={vendor(llvm)}, device={kind(gpu)}) -#pragma omp declare variant(h_ref ) match(implementation={vendor(unknown)},device={kind(nohost)}) +#pragma omp declare variant(h_ref ) match(implementation = {vendor(llvm)}, device = {kind(gpu)}) +#pragma omp declare variant(h_ref ) match(implementation = {vendor(unknown)}, device = {kind(nohost)}) template void h(C *hp, C *hp2, C *hq, C *lin) { } -// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(score(0):unknown)},device={kind(nohost)}) -// CHECK-NEXT: #pragma omp declare variant(h_ref) match(implementation={vendor(score(0):llvm)},device={kind(gpu)}) +// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(unknown)}, device={kind(nohost)}) +// CHECK-NEXT: #pragma omp declare variant(h_ref) match(implementation={vendor(llvm)}, device={kind(gpu)}) // CHECK-NEXT: template<> void h(float *hp, float *hp2, float *hq, float *lin) { // CHECK-NEXT: } @@ -86,7 +90,7 @@ void h(C *hp, C *hp2, C *hq, C *lin) { // CHECK-NEXT: h((float *)hp, (float *)hp2, (float *)hq, (float *)lin); // CHECK-NEXT: } #pragma omp declare variant(h_ref ) match(xxx = {}) -#pragma omp declare variant(h_ref ) match(implementation={vendor(ibm)},device={kind(cpu,gpu)}) +#pragma omp declare variant(h_ref ) match(implementation = {vendor(ibm)}, device = {kind(cpu, gpu)}) #pragma omp declare variant(h_ref ) match(implementation={vendor(unknown)}) template <> void h(double *hp, double *hp2, double *hq, double *lin) { @@ -97,36 +101,36 @@ 
void h(double *hp, double *hp2, double *hq, double *lin) { int fn(); // CHECK: int fn(int); int fn(int); -// CHECK: #pragma omp declare variant(fn) match(implementation={vendor(score(0):unknown)},device={kind(cpu, gpu)}) -// CHECK-NEXT: #pragma omp declare variant(fn) match(implementation={vendor(score(0):llvm)}) +// CHECK: #pragma omp declare variant(fn) match(implementation={vendor(unknown)}, device={kind(cpu, gpu)}) +// CHECK-NEXT: #pragma omp declare variant(fn) match(implementation={vendor(llvm)}) // CHECK-NEXT: int overload(); #pragma omp declare variant(fn) match(xxx = {}) #pragma omp declare variant(fn) match(implementation={vendor(llvm)}) -#pragma omp declare variant(fn) match(implementation={vendor(unknown)},device={kind(cpu,gpu)}) +#pragma omp declare variant(fn) match(implementation = {vendor(unknown)}, device = {kind(cpu, gpu)}) int overload(void); // CHECK: int fn_deduced_variant() { // CHECK-NEXT: return 0; // CHECK-NEXT: } auto fn_deduced_variant() { return 0; } -// CHECK: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(score(0):unknown)},device={kind(gpu, nohost)}) -// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(score(0):llvm)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(unknown)}, device={kind(gpu, nohost)}) +// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(llvm)}, device={kind(cpu, host)}) // CHECK-NEXT: int fn_deduced(); #pragma omp declare variant(fn_deduced_variant) match(xxx = {}) -#pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(llvm)},device={kind(cpu,host)}) -#pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(unknown)},device={kind(gpu,nohost)}) +#pragma omp declare variant(fn_deduced_variant) match(implementation = {vendor(llvm)}, device = {kind(cpu, host)}) +#pragma omp declare 
variant(fn_deduced_variant) match(implementation = {vendor(unknown)}, device = {kind(gpu, nohost)}) int fn_deduced(); // CHECK: int fn_deduced_variant1(); int fn_deduced_variant1(); -// CHECK: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(score(0):unknown)},device={kind(cpu, host)}) -// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(score(0):ibm)},device={kind(gpu, nohost)}) +// CHECK: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(unknown)}, device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(ibm)}, device={kind(gpu, nohost)}) // CHECK-NEXT: int fn_deduced1() { // CHECK-NEXT: return 0; // CHECK-NEXT: } #pragma omp declare variant(fn_deduced_variant1) match(xxx = {}) -#pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(ibm)},device={kind(gpu,nohost)}) -#pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(unknown)},device={kind(cpu,host)}) +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {vendor(ibm)}, device = {kind(gpu, nohost)}) +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {vendor(unknown)}, device = {kind(cpu, host)}) auto fn_deduced1() { return 0; } // CHECK: struct SpecialFuncs { @@ -140,11 +144,11 @@ auto fn_deduced1() { return 0; } // CHECK-NEXT: } // CHECK-NEXT: void bar(int) { // CHECK-NEXT: } -// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(score(0):unknown)},device={kind(nohost)}) -// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::bar) match(implementation={vendor(score(0):ibm)},device={kind(cpu)}) +// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)}, device={kind(nohost)}) +// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::bar) match(implementation={vendor(ibm)}, 
device={kind(cpu)}) // CHECK-NEXT: void foo1() { // CHECK-NEXT: } -// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(score(0):unknown)},device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)}, device={kind(cpu, host)}) // CHECK-NEXT: void xxx(); // CHECK-NEXT: } s; struct SpecialFuncs { @@ -157,14 +161,14 @@ struct SpecialFuncs { void bar(int) {} #pragma omp declare variant(SpecialFuncs::baz) match(xxx = {}) #pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) -#pragma omp declare variant(SpecialFuncs::bar) match(implementation={vendor(ibm)},device={kind(cpu)}) -#pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)},device={kind(nohost)}) +#pragma omp declare variant(SpecialFuncs::bar) match(implementation = {vendor(ibm)}, device = {kind(cpu)}) +#pragma omp declare variant(SpecialFuncs::baz) match(implementation = {vendor(unknown)}, device = {kind(nohost)}) void foo1() {} -#pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)},device={kind(cpu, host)}) +#pragma omp declare variant(SpecialFuncs::baz) match(implementation = {vendor(unknown)}, device = {kind(cpu, host)}) void xxx(); } s; -// CHECK: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(score(0):unknown)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)}, device={kind(cpu, host)}) // CHECK-NEXT: void SpecialFuncs::xxx() { // CHECK-NEXT: } void SpecialFuncs::xxx() {} @@ -172,12 +176,12 @@ void SpecialFuncs::xxx() {} // CHECK: static void static_f_variant() { // CHECK-NEXT: } static void static_f_variant() {} -// CHECK: #pragma omp declare variant(static_f_variant) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(static_f_variant) 
match(implementation={vendor(score(0):llvm)},device={kind(fpga)}) +// CHECK: #pragma omp declare variant(static_f_variant) match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(static_f_variant) match(implementation={vendor(llvm)}, device={kind(fpga)}) // CHECK-NEXT: static void static_f() { // CHECK-NEXT: } #pragma omp declare variant(static_f_variant) match(xxx = {}) -#pragma omp declare variant(static_f_variant) match(implementation={vendor(llvm)},device={kind(fpga)}) +#pragma omp declare variant(static_f_variant) match(implementation = {vendor(llvm)}, device = {kind(fpga)}) #pragma omp declare variant(static_f_variant) match(implementation={vendor(unknown)}) static void static_f() {} @@ -192,19 +196,19 @@ void bazzzz() { // CHECK: int fn_linkage_variant(); // CHECK: extern "C" { -// CHECK: #pragma omp declare variant(fn_linkage_variant) match(implementation={vendor(score(0):xxx)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(fn_linkage_variant) match(implementation={vendor(ti)}, device={kind(cpu, host)}) // CHECK: int fn_linkage(); // CHECK: } int fn_linkage_variant(); extern "C" { -#pragma omp declare variant(fn_linkage_variant) match(implementation = {vendor(xxx)},device={kind(cpu,host)}) +#pragma omp declare variant(fn_linkage_variant) match(implementation = {vendor(ti)}, device = {kind(cpu, host)}) int fn_linkage(); } // CHECK: extern "C" int fn_linkage_variant1() -// CHECK: #pragma omp declare variant(fn_linkage_variant1) match(implementation={vendor(score(0):xxx)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(fn_linkage_variant1) match(implementation={vendor(gnu)}, device={kind(cpu, host)}) // CHECK: int fn_linkage1(); extern "C" int fn_linkage_variant1(); -#pragma omp declare variant(fn_linkage_variant1) match(implementation = {vendor(xxx)},device={kind(cpu,host)}) +#pragma omp declare variant(fn_linkage_variant1) match(implementation = {vendor(gnu)}, device = {kind(cpu, host)}) int 
fn_linkage1(); diff --git a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp index 225990d62fc3b..55195ffd43b26 100644 --- a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp @@ -71,18 +71,18 @@ #pragma omp declare target #ifdef HOST -#define CORRECT host -#define SUBSET host, cpu +#define SUBSET host +#define CORRECT host, cpu #define WRONG host, nohost #endif // HOST #ifdef CPU -#define CORRECT cpu -#define SUBSET host, cpu +#define SUBSET cpu +#define CORRECT cpu, any #define WRONG cpu, gpu #endif // CPU #ifdef NOHOST -#define CORRECT nohost -#define SUBSET nohost, cpu +#define SUBSET nohost +#define CORRECT nohost, cpu #define WRONG nohost, host #endif // NOHOST diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c index 26507629ea370..7b87e696152bd 100644 --- a/clang/test/OpenMP/declare_variant_messages.c +++ b/clang/test/OpenMP/declare_variant_messages.c @@ -2,95 +2,102 @@ // RUN: %clang_cc1 -triple=x86_64-pc-win32 -verify -fopenmp-simd -x c -std=c99 -fms-extensions -Wno-pragma-pack %s -// expected-error@+1 {{expected an OpenMP directive}} -#pragma omp declare + +#pragma omp declare // expected-error {{expected an OpenMP directive}} int foo(void); #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} -#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} #pragma omp declare variant(foo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} #pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} 
#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} #pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} #pragma omp declare variant(foo) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} #pragma omp declare variant(foo) match // expected-error {{expected '(' after 'match'}} -#pragma omp declare variant(foo) match( // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foo) match() // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foo) match(xxx) // expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} -#pragma omp declare variant(foo) match(xxx=) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foo) match(xxx=yyy) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foo) match(xxx=yyy}) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foo) match(xxx={) // expected-error {{expected '}' or ',' after ')'}} expected-error {{expected '}'}} expected-note {{to match this '{'}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare variant(foo) match(xxx={vvv, vvv}) -#pragma omp declare variant(foo) match(xxx={vvv} xxx) // expected-error {{expected ','}} expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} expected-error {{context selector set 'xxx' is used already in the same 'omp declare variant' directive}} expected-note {{previously context selector set 'xxx' used here}} -#pragma omp declare variant(foo) match(xxx={vvv}) xxx // expected-warning {{extra tokens at the end of '#pragma omp declare variant' are ignored}} -#pragma omp declare variant(foo) match(implementation={xxx}) 
// expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foo) match(implementation={vendor}) // expected-error {{expected '(' after 'vendor'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor(}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor()}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foo) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{missing ':' after context selector score clause - ignoring}} -#pragma omp declare variant(foo) match(implementation={vendor(score( ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error 2 {{expected ')'}} expected-error {{expected vendor identifier in 'vendor' context 
selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note 2 {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor(score(foo()) ibm)}) // expected-warning {{missing ':' after context selector score clause - ignoring}} expected-error {{expression is not an integer constant expression}} -#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-error {{context trait selector 'vendor' is used already in the same 'implementation' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'vendor' used here}} -#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foo) match(device={xxx}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foo) match(device={kind}) // expected-error {{expected '(' after 'kind'}} expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error {{expected ')'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(device={kind(}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error 2 {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(device={kind()}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare 
variant' directive}} -#pragma omp declare variant(foo) match(device={kind(score cpu)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score( ibm)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(2 gpu)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(foo()) ibm)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(5): host), kind(llvm)}) // expected-error {{context trait selector 'kind' is used already in the same 'device' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'kind' used here}} expected-error {{expected ')' or ',' after 'score'}} expected-note {{to match this '('}} expected-error {{expected ')'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} expected-error {{unknown 'llvm' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(5): nohost), 
vendor(llvm)}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}}} +#pragma omp declare variant(foo) match( // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match() // expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx=) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx=yyy) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx=yyy}) // expected-error {{expected ')'}} expected-warning {{'xxx' is not a 
valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(xxx={) // expected-error {{expected ')'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx={vvv, vvv}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx={vvv} xxx) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx={vvv}) xxx // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(implementation={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 
'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(implementation={vendor}) // expected-warning {{the context selector 'vendor' in context set 'implementation' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(implementation={vendor(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(implementation={vendor()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} +#pragma omp declare variant(foo) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foo) match(implementation={vendor(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning 
{{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(implementation={vendor(score(foo()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{score expressions in the OpenMP context selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-warning {{the context selector 'vendor' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'vendor' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{the context selector 'kind' is not valid for the context set 'implementation'; selector ignored}} expected-note {{the context selector 'kind' can be nested in the context set 'device'; try 'match(device={kind(property)})'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(device={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'device'; selector ignored}} expected-note {{context selector options are: 'kind' 'isa' 'arch'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(device={kind}) // expected-warning {{the context selector 'kind' in context set 'device' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) 
match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} +#pragma omp declare variant(foo) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foo) match(device={kind(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(device={kind(score(foo()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 
'kind' in the context set 'device' cannot have a score ('foo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}} +#pragma omp declare variant(foo) match(device={kind(score(5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(device={kind(score(5): nohost), vendor(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'vendor' is not valid for the context set 'device'; selector ignored}} expected-note {{the context selector 'vendor' can be nested in the context set 'implementation'; try 'match(implementation={vendor(property)})'}} expected-note {{the ignored selector spans until here}} int bar(void); -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(foo) match(xxx={}) -int a; -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp threadprivate(a) +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) // expected-warning {{score expressions in the OpenMP context 
selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo is not}} +#pragma omp declare variant(foo) match(user = {condition(foo())}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo() is not}} +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +int score_and_cond_non_const(); + +#pragma omp declare variant(foo) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +int a; // expected-error {{'#pragma omp declare variant' can only be applied to functions}} + +#pragma omp declare variant(foo) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp threadprivate(a) // expected-error {{'#pragma omp declare variant' can only be applied to functions}} int var; #pragma omp threadprivate(var) -// expected-error@+2 {{expected an OpenMP directive}} expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare -// expected-error@+3 {{function declaration is expected 
after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare variant(foo) match(xxx={}) +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare // expected-error {{expected an OpenMP directive}} + + + +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma options align=packed int main(); -// expected-error@+3 {{function declaration is expected after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare variant(foo) match(xxx={}) + + +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma init_seg(compiler) int main(); -// expected-error@+1 {{single declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) + +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{single declaration is expected after 'declare variant' directive}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int b, c; int no_proto(); -#pragma omp declare variant(no_proto) match(xxx={}) +#pragma omp declare variant(no_proto) match(xxx={}) // 
expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int no_proto_too(); int proto1(int); -// expected-note@+2 {{previous declaration is here}} -#pragma omp declare variant(proto1) match(xxx={}) -int diff_proto(); -// expected-error@+1 {{conflicting types for 'diff_proto'}} -int diff_proto(double); -#pragma omp declare variant(no_proto) match(xxx={}) +#pragma omp declare variant(proto1) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +int diff_proto(); // expected-note {{previous declaration is here}} + +int diff_proto(double); // expected-error {{conflicting types for 'diff_proto'}} + +#pragma omp declare variant(no_proto) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int diff_proto1(double); int after_use_variant(void); @@ -99,37 +106,37 @@ int bar() { return after_use(); } -// expected-warning@+1 {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} -#pragma omp declare variant(after_use_variant) match(xxx={}) + +#pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} 
int after_use(void); -#pragma omp declare variant(after_use_variant) match(xxx={}) +#pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int defined(void) { return 0; } int defined1(void) { return 0; } -// expected-warning@+1 {{#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} -#pragma omp declare variant(after_use_variant) match(xxx={}) + +#pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{'#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int defined1(void); int diff_cc_variant(void); -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'int (void) __attribute__((vectorcall))'}} -#pragma omp declare variant(diff_cc_variant) match(xxx={}) + +#pragma omp declare variant(diff_cc_variant) match(xxx={}) // expected-error {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'int (void) __attribute__((vectorcall))'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} __vectorcall int diff_cc(void); int diff_ret_variant(void); -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'void (void)'}} 
-#pragma omp declare variant(diff_ret_variant) match(xxx={}) + +#pragma omp declare variant(diff_ret_variant) match(xxx={}) // expected-error {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'void (void)'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} void diff_ret(void); void marked(void); void not_marked(void); -// expected-note@+1 {{marked as 'declare variant' here}} -#pragma omp declare variant(not_marked) match(implementation={vendor(unknown)}, device={kind(cpu)}) + +#pragma omp declare variant(not_marked) match(implementation={vendor(unknown)}, device={kind(cpu)}) // expected-note {{marked as 'declare variant' here}} void marked_variant(void); -// expected-warning@+1 {{variant function in '#pragma omp declare variant' is itself marked as '#pragma omp declare variant'}} -#pragma omp declare variant(marked_variant) match(xxx={}) + +#pragma omp declare variant(marked_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{variant function in '#pragma omp declare variant' is itself marked as '#pragma omp declare variant'}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} void marked(void); -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} diff --git 
a/clang/test/OpenMP/declare_variant_messages.cpp b/clang/test/OpenMP/declare_variant_messages.cpp index ca1e4c33d17ee..f9950a88241c4 100644 --- a/clang/test/OpenMP/declare_variant_messages.cpp +++ b/clang/test/OpenMP/declare_variant_messages.cpp @@ -2,137 +2,180 @@ // RUN: %clang_cc1 -triple=x86_64-pc-win32 -verify -fopenmp-simd -x c++ -std=c++14 -fms-extensions -Wno-pragma-pack -fexceptions -fcxx-exceptions %s -// expected-error@+1 {{expected an OpenMP directive}} -#pragma omp declare + +#pragma omp declare // expected-error {{expected an OpenMP directive}} int foo(); template -T foofoo(); // expected-note 2 {{declared here}} - -#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} -#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} -#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} -#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} -#pragma omp declare variant(foofoo ) match( // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match() // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx) // expected-error {{expected '=' after 'xxx' context selector set name on 
'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx =) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = yyy) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = yyy }) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = {) // expected-error {{expected '}' or ',' after ')'}} expected-error {{expected '}'}} expected-note {{to match this '{'}} -#pragma omp declare variant(foofoo ) match(xxx = {}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv, vvv}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv} xxx) // expected-error {{expected ','}} expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} expected-error {{context selector set 'xxx' is used already in the same 'omp declare variant' directive}} expected-note {{previously context selector set 'xxx' used here}} -#pragma omp declare variant(foofoo ) match(xxx = {vvv}) xxx // expected-warning {{extra tokens at the end of '#pragma omp declare variant' are ignored}} -#pragma omp declare variant(foofoo ) match(implementation={xxx}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(implementation={vendor}) // expected-error {{expected '(' after 'vendor'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 
'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor()}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{missing ':' after context selector score clause - ignoring}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error 2 {{expected ')'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note 2 {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{missing ':' after context selector score clause - ignoring}} expected-error {{expression is not an integral constant expression}} expected-note {{non-constexpr function 'foofoo' cannot be used in a constant expression}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-error {{context trait selector 'vendor' is used already in the same 'implementation' context selector set of 'omp 
declare variant' directive}} expected-note {{previously context trait selector 'vendor' used here}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={kind}) // expected-error {{expected '(' after 'kind'}} expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error {{expected ')'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error 2 {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) 
match(device={kind(score(2 gpu)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(5): host), kind(llvm)}) // expected-error {{context trait selector 'kind' is used already in the same 'device' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'kind' used here}} expected-error {{expected ')' or ',' after 'score'}} expected-note {{to match this '('}} expected-error {{expected ')'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} expected-error {{unknown 'llvm' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(5): nohost), vendor(llvm)}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} +T foofoo(); + +#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} +#pragma omp declare variant( // expected-error {{expected expression}} 
expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} +#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} +#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} +#pragma omp declare variant(foofoo ) match( // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match() // expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foofoo ) match(implementation) // expected-warning {{expected '=' after the context set name "implementation"; '=' assumed}} expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 
'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation =) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = yyy) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{'yyy' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = yyy }) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{'yyy' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) 
match(implementation = {) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv, vvv}) // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv} implementation) // expected-error {{expected ')'}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 
'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv}) implementation // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor}) // expected-warning {{the context selector 'vendor' in context set 'implementation' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} +#pragma omp 
declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{score expressions in the OpenMP context selector need to be constant; foofoo() is not and will be ignored}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-warning {{the context selector 'vendor' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'vendor' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), 
kind(cpu)}) // expected-warning {{the context selector 'kind' is not valid for the context set 'implementation'; selector ignored}} expected-note {{the context selector 'kind' can be nested in the context set 'device'; try 'match(device={kind(property)})'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'device'; selector ignored}} expected-note {{context selector options are: 'kind' 'isa' 'arch'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind}) // expected-warning {{the context selector 'kind' in context set 'device' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} +#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} 
expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('foofoo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(5): nohost), vendor(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'vendor' is not valid 
for the context set 'device'; selector ignored}} expected-note {{the context selector 'vendor' can be nested in the context set 'implementation'; try 'match(implementation={vendor(property)})'}} expected-note {{the ignored selector spans until here}} int bar(); -#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} -#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} -#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} -#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} -#pragma omp declare variant(foofoo ) match( // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match() // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx) // expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx =) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = {) // expected-error {{expected '}' or ',' after ')'}} expected-error 
{{expected '}'}} expected-note {{to match this '{'}} -#pragma omp declare variant(foofoo ) match(xxx = {}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv, vvv}) -#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv} xxx) // expected-error {{expected ','}} expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} expected-error {{context selector set 'xxx' is used already in the same 'omp declare variant' directive}} expected-note {{previously context selector set 'xxx' used here}} -#pragma omp declare variant(foofoo ) match(xxx = {vvv}) xxx // expected-warning {{extra tokens at the end of '#pragma omp declare variant' are ignored}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{missing ':' after context selector score clause - ignoring}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error 2 {{expected ')'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} 
expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note 2 {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{missing ':' after context selector score clause - ignoring}} expected-error {{expression is not an integral constant expression}} expected-note {{non-constexpr function 'foofoo' cannot be used in a constant expression}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C+5): ibm), vendor(llvm)}) // expected-error {{context trait selector 'vendor' is used already in the same 'implementation' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'vendor' used here}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={kind}) // expected-error {{expected '(' after 'kind'}} expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error {{expected ')'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error 2 {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context 
selector of 'device' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(C gpu)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): host), kind(llvm)}) // expected-error {{context trait selector 'kind' is used already in the same 'device' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'kind' used here}} expected-error {{expected ')' or ',' after 'score'}} expected-note {{to match this '('}} expected-error {{expected ')'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} expected-error {{unknown 'llvm' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 
'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): nohost), vendor(llvm)}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} +#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} +#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} +#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} +#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} +#pragma omp declare variant(foofoo ) match( // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match() // expected-warning {{expected 
identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foofoo ) match(implementation) // expected-warning {{expected '=' after the context set name "implementation"; '=' assumed}} expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation =) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 
'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv, vvv}) // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) // expected-warning {{'score' is not a valid context selector for the context set 'user'; selector ignored}} expected-note {{context selector options are: 'condition'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) // expected-warning {{'score' is not a valid context selector for the context set 'user'; selector ignored}} expected-note {{context selector options are: 'condition'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of 
undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv} implementation) // expected-error {{expected ')'}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv}) xxx // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} 
expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C ibm)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C+5): ibm), vendor(llvm)}) // expected-warning {{the context selector 'vendor' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'vendor' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{the context selector 'kind' is not valid for the context set 'implementation'; selector ignored}} expected-note {{the context selector 'kind' can be nested in the context set 'device'; try 'match(device={kind(property)})'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'device'; selector ignored}} expected-note {{context selector options are: 'kind' 'isa' 'arch'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind}) // expected-warning {{the context selector 'kind' in context set 'device' requires a context property defined in parentheses; selector ignored}} 
expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} +#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(C gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('C'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // 
expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('foofoo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('C + 5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): nohost), vendor(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('C + 5'); score ignored}} expected-warning {{the context selector 'vendor' is not valid for the context set 'device'; selector ignored}} expected-note {{the context selector 'vendor' can be nested in the context set 'implementation'; try 'match(implementation={vendor(property)})'}} expected-note {{the ignored selector spans until here}} template T barbar(); -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(barbar ) match(xxx = {}) -int a; -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp threadprivate(a) +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo is not and will be 
ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo is not}} +#pragma omp declare variant(foo) match(user = {condition(foo())}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo() is not}} +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +int score_and_cond_non_const(); + +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) +#pragma omp declare variant(foo) match(user = {condition(foo())}) +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +template +int score_and_cond_non_const_no_inst(); + +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) // 
expected-warning {{score expressions in the OpenMP context selector need to be constant; foo is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo is not}} +#pragma omp declare variant(foo) match(user = {condition(foo())}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo() is not}} +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +template +int score_and_cond_non_const_inst(); + +constexpr int constexpr_fn(int i) { return 7 * i; } +#pragma omp declare variant(foo) match(implementation = {vendor(score(constexpr_fn(3)) : llvm)}) +#pragma omp declare variant(foo) match(user = {condition(constexpr_fn(1))}) +int score_and_cond_const(); + +#pragma omp declare variant(foo) match(implementation = {vendor(score(constexpr_fn(3)) : llvm)}) +#pragma omp declare variant(foo) match(implementation = {vendor(score(constexpr_fn(C)) : llvm)}) +#pragma omp declare variant(foo) match(user = {condition(constexpr_fn(1))}) +#pragma omp declare variant(foo) match(user = {condition(constexpr_fn(C))}) +template +int score_and_cond_const_inst(); + +void score_and_cond_inst() { + score_and_cond_non_const(); + score_and_cond_non_const_inst<8>(); // expected-note {{in instantiation of function template specialization 
'score_and_cond_non_const_inst<8>' requested here}} + score_and_cond_const_inst<9>(); +} + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +int a; // expected-error {{'#pragma omp declare variant' can only be applied to functions}} + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp threadprivate(a) // expected-error {{'#pragma omp declare variant' can only be applied to functions}} int var; #pragma omp threadprivate(var) -// expected-error@+2 {{expected an OpenMP directive}} expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp declare -// expected-error@+3 {{function declaration is expected after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp declare variant(barbar ) match(xxx = {}) +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare // expected-error {{expected an OpenMP directive}} + + + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{function 
declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(barbar ) match(xxx = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma options align = packed int main(); -// expected-error@+3 {{function declaration is expected after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp declare variant(barbar ) match(xxx = {}) + + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(barbar ) match(xxx = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma init_seg(compiler) int main(); -// expected-error@+1 {{single declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{single declaration is expected after 'declare variant' directive}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int b, c; -// expected-error@+1 {{'C' does not refer to a value}} -#pragma omp declare variant(C) match(xxx = {}) -// expected-note@+1 {{declared here}} -template + +#pragma omp declare variant(C) match(implementation = {}) // expected-error {{'C' does not refer to a value}} + +template // expected-note {{declared here}} void h(C *hp, C *hp2, C *hq, C *lin) { b = 0; } -// expected-error@+1 {{variant in '#pragma omp declare variant' with type '' is incompatible with type 'void 
(int *, int *, int *, int *)'}} -#pragma omp declare variant(barbar ) match(xxx = {}) + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{variant in '#pragma omp declare variant' with type '' is incompatible with type 'void (int *, int *, int *, int *)'}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} template <> void h(int *hp, int *hp2, int *hq, int *lin); @@ -142,113 +185,113 @@ int bar() { return after_use(); } -// expected-warning@+1 {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} -#pragma omp declare variant(after_use_variant) match(xxx = {}) + +#pragma omp declare variant(after_use_variant) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int after_use(void); int fn(); int fn(int); -#pragma omp declare variant(fn) match(xxx = {}) +#pragma omp declare variant(fn) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int overload(void); int fn1(); int fn1(int); -// expected-error@+1 {{variant in '#pragma omp declare variant' 
with type '' is incompatible with type 'int (float)'}} -#pragma omp declare variant(fn1) match(xxx = {}) + +#pragma omp declare variant(fn1) match(implementation = {}) // expected-error {{variant in '#pragma omp declare variant' with type '' is incompatible with type 'int (float)'}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int overload1(float); int fn_constexpr_variant(); -// expected-error@+2 {{'#pragma omp declare variant' does not support constexpr functions}} -#pragma omp declare variant(fn_constexpr_variant) match(xxx = {}) -constexpr int fn_constexpr(); + +#pragma omp declare variant(fn_constexpr_variant) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +constexpr int fn_constexpr(); // expected-error {{'#pragma omp declare variant' does not support constexpr functions}} constexpr int fn_constexpr_variant1(); -// expected-error@+1 {{'#pragma omp declare variant' does not support constexpr functions}} -#pragma omp declare variant(fn_constexpr_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_constexpr_variant1) match(implementation = {}) // expected-error {{'#pragma omp declare variant' does not support constexpr functions}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int 
fn_constexpr1(); int fn_sc_variant(); -// expected-error@+1 {{function with '#pragma omp declare variant' has a different storage class}} -#pragma omp declare variant(fn_sc_variant) match(xxx = {}) + +#pragma omp declare variant(fn_sc_variant) match(xxx = {}) // expected-error {{function with '#pragma omp declare variant' has a different storage class}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} static int fn_sc(); static int fn_sc_variant1(); -// expected-error@+1 {{function with '#pragma omp declare variant' has a different storage class}} -#pragma omp declare variant(fn_sc_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_sc_variant1) match(implementation = {}) // expected-error {{function with '#pragma omp declare variant' has a different storage class}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int fn_sc1(); int fn_inline_variant(); -// expected-error@+1 {{function with '#pragma omp declare variant' has a different inline specification}} -#pragma omp declare variant(fn_inline_variant) match(xxx = {}) + +#pragma omp declare variant(fn_inline_variant) match(xxx = {}) // expected-error {{function with '#pragma omp declare variant' has a different inline specification}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} inline int fn_inline(); inline int fn_inline_variant1(); -// expected-error@+1 
{{function with '#pragma omp declare variant' has a different inline specification}} -#pragma omp declare variant(fn_inline_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_inline_variant1) match(implementation = {}) // expected-error {{function with '#pragma omp declare variant' has a different inline specification}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int fn_inline1(); auto fn_deduced_variant() { return 0; } -#pragma omp declare variant(fn_deduced_variant) match(xxx = {}) +#pragma omp declare variant(fn_deduced_variant) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int fn_deduced(); int fn_deduced_variant1(); -#pragma omp declare variant(fn_deduced_variant1) match(xxx = {}) +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} auto fn_deduced1() { return 0; } auto fn_deduced3() { return 0; } -// expected-warning@+1 {{'#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} -#pragma omp declare variant(fn_deduced_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {}) // 
expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{'#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} auto fn_deduced3(); auto fn_deduced_variant2() { return 0; } -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int ()' is incompatible with type 'float ()'}} -#pragma omp declare variant(fn_deduced_variant2) match(xxx = {}) + +#pragma omp declare variant(fn_deduced_variant2) match(xxx = {}) // expected-error {{variant in '#pragma omp declare variant' with type 'int ()' is incompatible with type 'float ()'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} float fn_deduced2(); -// expected-error@+1 {{exception specification in declaration does not match previous declaration}} -int fn_except_variant() noexcept(true); -// expected-note@+2 {{previous declaration is here}} -#pragma omp declare variant(fn_except_variant) match(xxx = {}) -int fn_except() noexcept(false); -// expected-error@+1 {{exception specification in declaration does not match previous declaration}} -int fn_except_variant1() noexcept(false); -// expected-note@+2 {{previous declaration is here}} -#pragma omp declare variant(fn_except_variant1) match(xxx = {}) -int fn_except1() noexcept(true); +int fn_except_variant() noexcept(true); // expected-error {{exception specification in declaration does not match previous declaration}} + +#pragma omp declare variant(fn_except_variant) match(implementation = {}) // expected-warning 
{{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +int fn_except() noexcept(false); // expected-note {{previous declaration is here}} + + +int fn_except_variant1() noexcept(false); // expected-error {{exception specification in declaration does not match previous declaration}} + +#pragma omp declare variant(fn_except_variant1) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +int fn_except1() noexcept(true); // expected-note {{previous declaration is here}} struct SpecialFuncs { void vd(); - // expected-error@+2 {{'#pragma omp declare variant' does not support constructors}} -#pragma omp declare variant(SpecialFuncs::vd) match(xxx = {}) - SpecialFuncs(); - // expected-error@+2 {{'#pragma omp declare variant' does not support destructors}} -#pragma omp declare variant(SpecialFuncs::vd) match(xxx = {}) - ~SpecialFuncs(); + +#pragma omp declare variant(SpecialFuncs::vd) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} + SpecialFuncs(); // expected-error {{'#pragma omp declare variant' does not support constructors}} + +#pragma omp declare variant(SpecialFuncs::vd) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note 
{{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} + ~SpecialFuncs(); // expected-error {{'#pragma omp declare variant' does not support destructors}} void baz(); void bar(); void bar(int); -#pragma omp declare variant(SpecialFuncs::baz) match(xxx = {}) -#pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int (*)()' is incompatible with type 'void (SpecialFuncs::*)()'}} -#pragma omp declare variant(fn_sc_variant1) match(xxx = {}) +#pragma omp declare variant(SpecialFuncs::baz) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} + +#pragma omp declare variant(fn_sc_variant1) match(implementation = {}) // expected-error {{variant in '#pragma omp declare variant' with type 'int (*)()' is incompatible with type 'void (SpecialFuncs::*)()'}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} void foo1(); SpecialFuncs& foo(const SpecialFuncs&); SpecialFuncs& bar(SpecialFuncs&&); - // expected-error@+2 {{'#pragma omp declare 
variant' does not support defaulted functions}} -#pragma omp declare variant(SpecialFuncs::foo) match(xxx = {}) - SpecialFuncs& operator=(const SpecialFuncs&) = default; - // expected-error@+2 {{'#pragma omp declare variant' does not support deleted functions}} -#pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) - SpecialFuncs& operator=(SpecialFuncs&&) = delete; + +#pragma omp declare variant(SpecialFuncs::foo) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} + SpecialFuncs& operator=(const SpecialFuncs&) = default; // expected-error {{'#pragma omp declare variant' does not support defaulted functions}} + +#pragma omp declare variant(SpecialFuncs::bar) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} + SpecialFuncs& operator=(SpecialFuncs&&) = delete; // expected-error {{'#pragma omp declare variant' does not support deleted functions}} }; namespace N { -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} } // namespace N -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant + +#pragma omp declare variant // expected-error {{function declaration is expected after 
'declare variant' directive}} + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} diff --git a/clang/test/OpenMP/declare_variant_mixed_codegen.cpp b/clang/test/OpenMP/declare_variant_mixed_codegen.cpp index 0c13f5f2f1205..4609a4f77728f 100644 --- a/clang/test/OpenMP/declare_variant_mixed_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_mixed_codegen.cpp @@ -49,7 +49,7 @@ int call() { return 1; } static int stat_unused_no_emit() { return 1; } static int stat_unused_(); #pragma omp declare variant(stat_unused_) match(implementation = {vendor(llvm)}, device={kind(cpu)}) -#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(xxx)}, device={kind(gpu)}) +#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(unknown)}, device = {kind(gpu)}) static int stat_unused() { return 1; } static int stat_used_(); @@ -103,16 +103,16 @@ void xxx() { int prio() { return 81; } int prio1() { return 82; } -#pragma omp declare variant(prio) match(implementation = {vendor(score(2): llvm)}, device={kind(cpu,host)}) -#pragma omp declare variant(prio1) match(implementation = {vendor(score(1): llvm)}, device={kind(cpu)}) +#pragma omp declare variant(prio1) match(implementation = {vendor(score(2): llvm)}, device={kind(cpu,host)}) +#pragma omp declare variant(prio) match(implementation = {vendor(score(1): llvm)}, device={kind(cpu)}) int prio_() { return 1; } static int prio2() { return 83; } static int prio3() { return 84; } static int prio4() { return 84; } -#pragma omp declare variant(prio4) match(implementation = {vendor(score(8): llvm)},device={kind(cpu,host)}) -#pragma omp declare variant(prio2) match(implementation = {vendor(score(5): llvm)}) +#pragma omp declare variant(prio4) match(implementation = {vendor(score(5): llvm)}) +#pragma omp declare variant(prio2) match(implementation = {vendor(score(8): llvm)}, device={kind(cpu,host)}) #pragma omp declare variant(prio3) 
match(implementation = {vendor(score(7): llvm)}, device={kind(cpu)}) static int prio1_() { return 1; } @@ -137,7 +137,7 @@ int fn_variant2() { return 1; } #pragma omp declare variant(fn_variant2) match(implementation = {vendor(llvm)}, device={kind(fpga)}) int fn2() { return 87; } -#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(xxx)}, device={kind(gpu)}) +#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(unknown)}, device = {kind(gpu)}) template static T stat_unused_T() { return 88; } diff --git a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp index 7f84709b80d42..a9ed8f7486822 100644 --- a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp @@ -43,13 +43,13 @@ #define HEADER #ifdef GPU -#define CORRECT gpu -#define SUBSET nohost, gpu +#define SUBSET gpu +#define CORRECT nohost, gpu #define WRONG cpu, gpu #endif // GPU #ifdef NOHOST -#define CORRECT nohost -#define SUBSET nohost, gpu +#define SUBSET nohost +#define CORRECT nohost, gpu #define WRONG nohost, host #endif // NOHOST diff --git a/clang/test/Preprocessor/Weverything_pragma.c b/clang/test/Preprocessor/Weverything_pragma.c index 1815f554fffd6..f2cf97ed4a1ca 100644 --- a/clang/test/Preprocessor/Weverything_pragma.c +++ b/clang/test/Preprocessor/Weverything_pragma.c @@ -6,7 +6,7 @@ // but -Weverything forces it #define UNUSED_MACRO1 1 // expected-warning{{macro is not used}} -void foo() // expected-warning {{no previous prototype for function}} +void foo(void) // expected-warning {{no previous prototype for function}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} { // A diagnostic without DefaultIgnore, and not part of a group. 
diff --git a/clang/test/Preprocessor/pragma_diagnostic.c b/clang/test/Preprocessor/pragma_diagnostic.c index 99724623207f1..75d2bbc7190f3 100644 --- a/clang/test/Preprocessor/pragma_diagnostic.c +++ b/clang/test/Preprocessor/pragma_diagnostic.c @@ -35,19 +35,19 @@ #endif // Testing pragma clang diagnostic with -Weverything -void ppo(){} // First test that we do not diagnose on this. +void ppo(void){} // First test that we do not diagnose on this. #pragma clang diagnostic warning "-Weverything" -void ppp(){} // expected-warning {{no previous prototype for function 'ppp'}} +void ppp(void){} // expected-warning {{no previous prototype for function 'ppp'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic ignored "-Weverything" // Reset it. -void ppq(){} +void ppq(void){} #pragma clang diagnostic error "-Weverything" // Now set to error -void ppr(){} // expected-error {{no previous prototype for function 'ppr'}} +void ppr(void){} // expected-error {{no previous prototype for function 'ppr'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic warning "-Weverything" // This should not be effective -void pps(){} // expected-error {{no previous prototype for function 'pps'}} +void pps(void){} // expected-error {{no previous prototype for function 'pps'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} diff --git a/clang/test/Preprocessor/pushable-diagnostics.c b/clang/test/Preprocessor/pushable-diagnostics.c index 4a0dd895a78e4..9eaf87d58f820 100644 --- a/clang/test/Preprocessor/pushable-diagnostics.c +++ b/clang/test/Preprocessor/pushable-diagnostics.c @@ -18,28 +18,28 @@ int c = 'df'; // expected-warning{{multi-character character constant}} // Test -Weverything -void ppo0(){} // first verify that we do not give anything on 
this +void ppo0(void){} // first verify that we do not give anything on this #pragma clang diagnostic push // now push #pragma clang diagnostic warning "-Weverything" -void ppr1(){} // expected-warning {{no previous prototype for function 'ppr1'}} +void ppr1(void){} // expected-warning {{no previous prototype for function 'ppr1'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic push // push again #pragma clang diagnostic ignored "-Weverything" // Set to ignore in this level. -void pps2(){} +void pps2(void){} #pragma clang diagnostic warning "-Weverything" // Set to warning in this level. -void ppt2(){} // expected-warning {{no previous prototype for function 'ppt2'}} +void ppt2(void){} // expected-warning {{no previous prototype for function 'ppt2'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic error "-Weverything" // Set to error in this level. 
-void ppt3(){} // expected-error {{no previous prototype for function 'ppt3'}} +void ppt3(void){} // expected-error {{no previous prototype for function 'ppt3'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic pop // pop should go back to warning level -void pps1(){} // expected-warning {{no previous prototype for function 'pps1'}} +void pps1(void){} // expected-warning {{no previous prototype for function 'pps1'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic pop // Another pop should disble it again -void ppu(){} +void ppu(void){} diff --git a/clang/test/Sema/warn-strict-prototypes.c b/clang/test/Sema/warn-strict-prototypes.c index 5565a09060fc2..50b0f7d060f2e 100644 --- a/clang/test/Sema/warn-strict-prototypes.c +++ b/clang/test/Sema/warn-strict-prototypes.c @@ -1,15 +1,18 @@ // RUN: %clang_cc1 -triple i386-pc-unknown -fsyntax-only -Wstrict-prototypes -Wno-implicit-function-declaration -verify %s // RUN: %clang_cc1 -triple i386-pc-unknown -fsyntax-only -Wstrict-prototypes -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s +// function definition with 0 params, no prototype, no preceding declaration. +void foo0() {} // expected-warning {{this old-style function definition is not preceded by a prototype}} + // function declaration with unspecified params void foo1(); // expected-warning {{this function declaration is not a prototype}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:11}:"void" // function declaration with 0 params void foo2(void); -// function definition with 0 params(for both cases), -// valid according to 6.7.5.3/14 -void foo1() {} +// function definition with 0 params, no prototype. +void foo1() {} // expected-warning {{this old-style function definition is not preceded by a prototype}} +// function definition with 0 params, prototype. 
void foo2(void) {} // function type typedef unspecified params diff --git a/clang/test/Sema/warn-strict-prototypes.cpp b/clang/test/Sema/warn-strict-prototypes.cpp new file mode 100644 index 0000000000000..6a3839ff93672 --- /dev/null +++ b/clang/test/Sema/warn-strict-prototypes.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -verify -fsyntax-only -Wstrict-prototypes %s +// expected-no-diagnostics + +void decl(); +void decl_void(void); + +void def() {} +void def_void(void) {} diff --git a/clang/test/Sema/warn-strict-prototypes.m b/clang/test/Sema/warn-strict-prototypes.m index 66d574f75f802..e2fde8ee38fc1 100644 --- a/clang/test/Sema/warn-strict-prototypes.m +++ b/clang/test/Sema/warn-strict-prototypes.m @@ -10,7 +10,7 @@ @interface Foo @end -void foo() { +void foo() { // expected-warning {{this old-style function definition is not preceded by a prototype}} void (^block)() = // expected-warning {{this block declaration is not a prototype}} ^void(int arg) { // no warning }; diff --git a/clang/test/Sema/warn-unused-parameters.c b/clang/test/Sema/warn-unused-parameters.c index 11db7300c5c43..d325f887f885a 100644 --- a/clang/test/Sema/warn-unused-parameters.c +++ b/clang/test/Sema/warn-unused-parameters.c @@ -7,7 +7,7 @@ int f0(int x, return x; } -void f1() { +void f1(void) { (void)^(int x, int y, int z __attribute__((unused))) { return x; }; diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c b/clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c rename to clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c.expected similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c.expected rename to 
clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c rename to clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.expected rename to clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected rename to clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/def-and-decl.test b/clang/test/utils/update_cc_test_checks/def-and-decl.test similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/def-and-decl.test rename to clang/test/utils/update_cc_test_checks/def-and-decl.test diff --git a/clang/test/utils/update_cc_test_checks/lit.local.cfg b/clang/test/utils/update_cc_test_checks/lit.local.cfg new file mode 100644 index 0000000000000..0250446423cb5 --- /dev/null +++ b/clang/test/utils/update_cc_test_checks/lit.local.cfg @@ -0,0 +1,25 @@ +import os + +import lit.util + +# python 2.7 backwards compatibility +try: + from shlex import quote as shell_quote +except ImportError: + from pipes import quote as shell_quote + + +config.test_format = 
lit.formats.ShTest(execute_external=False) +config.suffixes = ['.test'] + +clang_path = os.path.join(config.clang_tools_dir, 'clang') +extra_args = '--clang ' + shell_quote(clang_path) +opt_path = os.path.join(config.llvm_tools_dir, 'opt') +extra_args += ' --opt ' + shell_quote(opt_path) +script_path = os.path.join(config.llvm_src_root, 'utils', + 'update_cc_test_checks.py') +assert os.path.isfile(script_path) +config.substitutions.append( + ('%update_cc_test_checks', "%s %s %s" % ( + shell_quote(config.python_executable), shell_quote(script_path), + extra_args))) diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/mangled_names.test b/clang/test/utils/update_cc_test_checks/mangled_names.test similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/mangled_names.test rename to clang/test/utils/update_cc_test_checks/mangled_names.test diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 50788cb7cf8b6..9105c616786fb 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/ThreadPool.h" #include "llvm/Support/Threading.h" #include #include @@ -484,14 +485,9 @@ int main(int argc, const char **argv) { DependencyScanningService Service(ScanMode, Format, ReuseFileManager, SkipExcludedPPRanges); -#if LLVM_ENABLE_THREADS - unsigned NumWorkers = - NumThreads == 0 ? 
llvm::hardware_concurrency() : NumThreads; -#else - unsigned NumWorkers = 1; -#endif + llvm::ThreadPool Pool(llvm::hardware_concurrency(NumThreads)); std::vector> WorkerTools; - for (unsigned I = 0; I < NumWorkers; ++I) + for (unsigned I = 0; I < Pool.getThreadCount(); ++I) WorkerTools.push_back(std::make_unique(Service)); std::vector Inputs; @@ -499,7 +495,6 @@ int main(int argc, const char **argv) { AdjustingCompilations->getAllCompileCommands()) Inputs.emplace_back(Cmd); - std::vector WorkerThreads; std::atomic HadErrors(false); FullDeps FD; std::mutex Lock; @@ -507,11 +502,11 @@ int main(int argc, const char **argv) { if (Verbose) { llvm::outs() << "Running clang-scan-deps on " << Inputs.size() - << " files using " << NumWorkers << " workers\n"; + << " files using " << Pool.getThreadCount() << " workers\n"; } - for (unsigned I = 0; I < NumWorkers; ++I) { - auto Worker = [I, &Lock, &Index, &Inputs, &HadErrors, &FD, &WorkerTools, - &DependencyOS, &Errs]() { + for (unsigned I = 0; I < Pool.getThreadCount(); ++I) { + Pool.async([I, &Lock, &Index, &Inputs, &HadErrors, &FD, &WorkerTools, + &DependencyOS, &Errs]() { llvm::StringSet<> AlreadySeenModules; while (true) { const SingleCommandCompilationDatabase *Input; @@ -543,16 +538,9 @@ int main(int argc, const char **argv) { HadErrors = true; } } - }; -#if LLVM_ENABLE_THREADS - WorkerThreads.emplace_back(std::move(Worker)); -#else - // Run the worker without spawning a thread when threads are disabled. 
- Worker(); -#endif + }); } - for (auto &W : WorkerThreads) - W.join(); + Pool.wait(); if (Format == ScanningOutputFormat::Full) FD.printFullOutput(llvm::outs()); diff --git a/clang/tools/libclang/CXIndexDataConsumer.cpp b/clang/tools/libclang/CXIndexDataConsumer.cpp index ad871228ccdfb..fb04a06f8ae7e 100644 --- a/clang/tools/libclang/CXIndexDataConsumer.cpp +++ b/clang/tools/libclang/CXIndexDataConsumer.cpp @@ -1245,6 +1245,9 @@ static CXIdxEntityKind getEntityKindFromSymbolKind(SymbolKind K, SymbolLanguage case SymbolKind::Macro: case SymbolKind::ClassProperty: case SymbolKind::Using: + case SymbolKind::TemplateTypeParm: + case SymbolKind::TemplateTemplateParm: + case SymbolKind::NonTypeTemplateParm: return CXIdxEntity_Unexposed; case SymbolKind::Enum: return CXIdxEntity_Enum; diff --git a/clang/unittests/Index/IndexTests.cpp b/clang/unittests/Index/IndexTests.cpp index a279f48fbb375..068b30ebfa8af 100644 --- a/clang/unittests/Index/IndexTests.cpp +++ b/clang/unittests/Index/IndexTests.cpp @@ -249,8 +249,13 @@ TEST(IndexTest, IndexTypeParmDecls) { Index->Symbols.clear(); tooling::runToolOnCode(std::make_unique(Index, Opts), Code); EXPECT_THAT(Index->Symbols, - AllOf(Contains(QName("Foo::T")), Contains(QName("Foo::I")), - Contains(QName("Foo::C")), Contains(QName("Foo::NoRef")))); + AllOf(Contains(AllOf(QName("Foo::T"), + Kind(SymbolKind::TemplateTypeParm))), + Contains(AllOf(QName("Foo::I"), + Kind(SymbolKind::NonTypeTemplateParm))), + Contains(AllOf(QName("Foo::C"), + Kind(SymbolKind::TemplateTemplateParm))), + Contains(QName("Foo::NoRef")))); } TEST(IndexTest, UsingDecls) { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index c3b22e067c9be..ba825a90edb29 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -107,6 +107,7 @@ static std::string ReadPCHRecord(StringRef type) { .Case("IdentifierInfo *", "Record.readIdentifier()") .Case("StringRef", 
"Record.readString()") .Case("ParamIdx", "ParamIdx::deserialize(Record.readInt())") + .Case("OMPTraitInfo *", "Record.readOMPTraitInfo()") .Default("Record.readInt()"); } @@ -130,6 +131,8 @@ static std::string WritePCHRecord(StringRef type, StringRef name) { .Case("StringRef", "AddString(" + std::string(name) + ");\n") .Case("ParamIdx", "push_back(" + std::string(name) + ".serialize());\n") + .Case("OMPTraitInfo *", + "writeOMPTraitInfo(" + std::string(name) + ");\n") .Default("push_back(" + std::string(name) + ");\n"); } @@ -338,7 +341,7 @@ namespace { void writeDump(raw_ostream &OS) const override { if (type == "FunctionDecl *" || type == "NamedDecl *") { OS << " OS << \" \";\n"; - OS << " dumpBareDeclRef(SA->get" << getUpperName() << "());\n"; + OS << " dumpBareDeclRef(SA->get" << getUpperName() << "());\n"; } else if (type == "IdentifierInfo *") { // Some non-optional (comma required) identifier arguments can be the // empty string but are then recorded as a nullptr. @@ -360,6 +363,8 @@ namespace { OS << " if (SA->get" << getUpperName() << "().isValid())\n "; OS << " OS << \" \" << SA->get" << getUpperName() << "().getSourceIndex();\n"; + } else if (type == "OMPTraitInfo *") { + OS << " OS << \" \" << *SA->get" << getUpperName() << "();\n"; } else { llvm_unreachable("Unknown SimpleArgument type!"); } @@ -500,7 +505,7 @@ namespace { OS << " if (is" << getLowerName() << "Expr)\n"; OS << " return " << getLowerName() << "Expr && (" << getLowerName() << "Expr->isValueDependent() || " << getLowerName() - << "Expr->isTypeDependent());\n"; + << "Expr->isTypeDependent());\n"; OS << " else\n"; OS << " return " << getLowerName() << "Type->getType()->isDependentType();\n"; @@ -525,11 +530,11 @@ namespace { void writeASTVisitorTraversal(raw_ostream &OS) const override { StringRef Name = getUpperName(); OS << " if (A->is" << Name << "Expr()) {\n" - << " if (!getDerived().TraverseStmt(A->get" << Name << "Expr()))\n" - << " return false;\n" + << " if 
(!getDerived().TraverseStmt(A->get" << Name << "Expr()))\n" + << " return false;\n" << " } else if (auto *TSI = A->get" << Name << "Type()) {\n" << " if (!getDerived().TraverseTypeLoc(TSI->getTypeLoc()))\n" - << " return false;\n" + << " return false;\n" << " }\n"; } @@ -658,7 +663,7 @@ namespace { std::string IteratorType = getLowerName().str() + "_iterator"; std::string BeginFn = getLowerName().str() + "_begin()"; std::string EndFn = getLowerName().str() + "_end()"; - + OS << " typedef " << Type << "* " << IteratorType << ";\n"; OS << " " << IteratorType << " " << BeginFn << " const {" << " return " << ArgName << "; }\n"; @@ -915,14 +920,14 @@ namespace { for (size_t I = 0; I < enums.size(); ++I) { if (Uniques.insert(enums[I]).second) OS << " case " << getAttrName() << "Attr::" << enums[I] - << ": return \"" << values[I] << "\";\n"; + << ": return \"" << values[I] << "\";\n"; } OS << " }\n" << " llvm_unreachable(\"No enumerator with that value\");\n" << " }\n"; } }; - + class VariadicEnumArgument: public VariadicArgument { std::string type, QualifiedTypeName; std::vector values, enums, uniques; @@ -945,13 +950,13 @@ namespace { enums(Arg.getValueAsListOfStrings("Enums")), uniques(uniqueEnumsInOrder(enums)) { QualifiedTypeName = getAttrName().str() + "Attr::" + type; - + // FIXME: Emit a proper error assert(!uniques.empty()); } bool isVariadicEnumArg() const override { return true; } - + void writeDeclarations(raw_ostream &OS) const override { auto i = uniques.cbegin(), e = uniques.cend(); // The last one needs to not have a comma. 
@@ -964,7 +969,7 @@ namespace { OS << " " << *e << "\n"; OS << " };\n"; OS << "private:\n"; - + VariadicArgument::writeDeclarations(OS); } @@ -1041,7 +1046,7 @@ namespace { OS << " VersionTuple get" << getUpperName() << "() const {\n"; OS << " return " << getLowerName() << ";\n"; OS << " }\n"; - OS << " void set" << getUpperName() + OS << " void set" << getUpperName() << "(ASTContext &C, VersionTuple V) {\n"; OS << " " << getLowerName() << " = V;\n"; OS << " }"; @@ -1308,6 +1313,8 @@ createArgument(const Record &Arg, StringRef Attr, Ptr = std::make_unique(Arg, Attr); else if (ArgName == "VersionArgument") Ptr = std::make_unique(Arg, Attr); + else if (ArgName == "OMPTraitInfoArgument") + Ptr = std::make_unique(Arg, Attr, "OMPTraitInfo *"); if (!Ptr) { // Search in reverse order so that the most-derived type is handled first. @@ -2252,10 +2259,10 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { // When attribute documentation can be generated as part of the build // itself, this code can be removed. (void)R.getValueAsListOfDefs("Documentation"); - + if (!R.getValueAsBit("ASTNode")) continue; - + ArrayRef> Supers = R.getSuperClasses(); assert(!Supers.empty() && "Forgot to specify a superclass for the attr"); std::string SuperName; @@ -2437,7 +2444,7 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { } OS << " {\n"; - + for (auto const &ai : Args) { if (!shouldEmitArg(ai)) continue; ai->writeCtorBody(OS); @@ -2452,7 +2459,7 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { // Emit a constructor that takes all the non-fake arguments. if (HasFakeArg) emitCtor(true, false); - + // Emit a constructor that takes all the non-fake, non-optional arguments. 
if (HasOptArg) emitCtor(false, false); @@ -2461,7 +2468,7 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { OS << " void printPretty(raw_ostream &OS,\n" << " const PrintingPolicy &Policy) const;\n"; OS << " const char *getSpelling() const;\n"; - + if (!ElideSpelling) { assert(!SemanticToSyntacticMap.empty() && "Empty semantic mapping list"); OS << " Spelling getSemanticSpelling() const {\n"; @@ -2506,7 +2513,7 @@ void clang::EmitClangAttrImpl(RecordKeeper &Records, raw_ostream &OS) { for (auto *Attr : Attrs) { Record &R = *Attr; - + if (!R.getValueAsBit("ASTNode")) continue; @@ -2978,7 +2985,7 @@ static void GenerateHasAttrSpellingStringSwitch( // them. If the attribute has no scope, the version information must not // have the default value (1), as that's incorrect. Instead, the unscoped // attribute version information should be taken from the SD-6 standing - // document, which can be found at: + // document, which can be found at: // https://isocpp.org/std/standing-documents/sd-6-sg10-feature-test-recommendations int Version = 1; @@ -3270,7 +3277,7 @@ void EmitClangAttrParsedAttrList(RecordKeeper &Records, raw_ostream &OS) { OS << "#ifndef PARSED_ATTR\n"; OS << "#define PARSED_ATTR(NAME) NAME\n"; OS << "#endif\n\n"; - + ParsedAttrMap Names = getParsedAttrList(Records); for (const auto &I : Names) { OS << "PARSED_ATTR(" << I.first << ")\n"; diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 6c4856f05a2dc..21af345dc6bb8 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -342,21 +342,33 @@ if(APPLE) if(COMPILER_RT_ENABLE_IOS) list(APPEND DARWIN_EMBEDDED_PLATFORMS ios) + set(DARWIN_ios_MIN_VER 9.0) set(DARWIN_ios_MIN_VER_FLAG -miphoneos-version-min) set(DARWIN_ios_SANITIZER_MIN_VER_FLAG - ${DARWIN_ios_MIN_VER_FLAG}=9.0) + ${DARWIN_ios_MIN_VER_FLAG}=${DARWIN_ios_MIN_VER}) + set(DARWIN_iossim_MIN_VER_FLAG -mios-simulator-version-min) + 
set(DARWIN_iossim_SANITIZER_MIN_VER_FLAG + ${DARWIN_iossim_MIN_VER_FLAG}=${DARWIN_ios_MIN_VER}) endif() if(COMPILER_RT_ENABLE_WATCHOS) list(APPEND DARWIN_EMBEDDED_PLATFORMS watchos) + set(DARWIN_watchos_MIN_VER 2.0) set(DARWIN_watchos_MIN_VER_FLAG -mwatchos-version-min) set(DARWIN_watchos_SANITIZER_MIN_VER_FLAG - ${DARWIN_watchos_MIN_VER_FLAG}=2.0) + ${DARWIN_watchos_MIN_VER_FLAG}=${DARWIN_watchos_MIN_VER}) + set(DARWIN_watchossim_MIN_VER_FLAG -mwatchos-simulator-version-min) + set(DARWIN_watchossim_SANITIZER_MIN_VER_FLAG + ${DARWIN_watchossim_MIN_VER_FLAG}=${DARWIN_watchos_MIN_VER}) endif() if(COMPILER_RT_ENABLE_TVOS) list(APPEND DARWIN_EMBEDDED_PLATFORMS tvos) + set(DARWIN_tvos_MIN_VER 9.0) set(DARWIN_tvos_MIN_VER_FLAG -mtvos-version-min) set(DARWIN_tvos_SANITIZER_MIN_VER_FLAG - ${DARWIN_tvos_MIN_VER_FLAG}=9.0) + ${DARWIN_tvos_MIN_VER_FLAG}=${DARWIN_tvos_MIN_VER}) + set(DARWIN_tvossim_MIN_VER_FLAG -mtvos-simulator-version-min) + set(DARWIN_tvossim_SANITIZER_MIN_VER_FLAG + ${DARWIN_tvossim_MIN_VER_FLAG}=${DARWIN_tvos_MIN_VER}) endif() set(SANITIZER_COMMON_SUPPORTED_OS osx) @@ -368,8 +380,9 @@ if(APPLE) # Note: In order to target x86_64h on OS X the minimum deployment target must # be 10.8 or higher. 
set(DEFAULT_SANITIZER_MIN_OSX_VERSION 10.10) + set(DARWIN_osx_MIN_VER_FLAG "-mmacosx-version-min") if(NOT SANITIZER_MIN_OSX_VERSION) - string(REGEX MATCH "-mmacosx-version-min=([.0-9]+)" + string(REGEX MATCH "${DARWIN_osx_MIN_VER_FLAG}=([.0-9]+)" MACOSX_VERSION_MIN_FLAG "${CMAKE_CXX_FLAGS}") if(MACOSX_VERSION_MIN_FLAG) set(SANITIZER_MIN_OSX_VERSION "${CMAKE_MATCH_1}") @@ -403,10 +416,10 @@ if(APPLE) set(DARWIN_osx_CFLAGS ${DARWIN_COMMON_CFLAGS} - -mmacosx-version-min=${SANITIZER_MIN_OSX_VERSION}) + ${DARWIN_osx_MIN_VER_FLAG}=${SANITIZER_MIN_OSX_VERSION}) set(DARWIN_osx_LINK_FLAGS ${DARWIN_COMMON_LINK_FLAGS} - -mmacosx-version-min=${SANITIZER_MIN_OSX_VERSION}) + ${DARWIN_osx_MIN_VER_FLAG}=${SANITIZER_MIN_OSX_VERSION}) if(DARWIN_osx_SYSROOT) list(APPEND DARWIN_osx_CFLAGS -isysroot ${DARWIN_osx_SYSROOT}) @@ -431,11 +444,11 @@ if(APPLE) if(DARWIN_${platform}sim_SYSROOT) set(DARWIN_${platform}sim_CFLAGS ${DARWIN_COMMON_CFLAGS} - ${DARWIN_${platform}_SANITIZER_MIN_VER_FLAG} + ${DARWIN_${platform}sim_SANITIZER_MIN_VER_FLAG} -isysroot ${DARWIN_${platform}sim_SYSROOT}) set(DARWIN_${platform}sim_LINK_FLAGS ${DARWIN_COMMON_LINK_FLAGS} - ${DARWIN_${platform}_SANITIZER_MIN_VER_FLAG} + ${DARWIN_${platform}sim_SANITIZER_MIN_VER_FLAG} -isysroot ${DARWIN_${platform}sim_SYSROOT}) set(DARWIN_${platform}sim_SKIP_CC_KEXT On) @@ -487,6 +500,10 @@ if(APPLE) endforeach() endif() + # Explicitly disable unsupported Sanitizer configurations.
+ list(REMOVE_ITEM FUZZER_SUPPORTED_OS "watchos") + list(REMOVE_ITEM FUZZER_SUPPORTED_OS "watchossim") + # for list_intersect include(CompilerRTUtils) diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h index 3d338501ae4ae..ad2a17ef7014a 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -40,15 +40,15 @@ struct AndroidConfig { using SizeClassMap = AndroidSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 // 256MB regions - typedef SizeClassAllocator64 Primary; #else // 256KB regions - typedef SizeClassAllocator32 Primary; + typedef SizeClassAllocator32 Primary; #endif // Cache blocks up to 2MB - typedef MapAllocator> Secondary; + typedef MapAllocator> Secondary; template using TSDRegistryT = TSDRegistrySharedT; // Shared, max 2 TSDs. }; @@ -57,12 +57,12 @@ struct AndroidSvelteConfig { using SizeClassMap = SvelteSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 // 128MB regions - typedef SizeClassAllocator64 Primary; + typedef SizeClassAllocator64 Primary; #else // 64KB regions - typedef SizeClassAllocator32 Primary; + typedef SizeClassAllocator32 Primary; #endif - typedef MapAllocator> Secondary; + typedef MapAllocator> Secondary; template using TSDRegistryT = TSDRegistrySharedT; // Shared, only 1 TSD. }; diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index e8390a7b44f16..f49fc9aac84cb 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -32,6 +32,8 @@ extern "C" inline void EmptyCallback() {} namespace scudo { +enum class Option { ReleaseInterval }; + template class Allocator { public: @@ -624,8 +626,14 @@ class Allocator { return Options.MayReturnNull; } - // TODO(kostyak): implement this as a "backend" to mallopt. 
- bool setOption(UNUSED uptr Option, UNUSED uptr Value) { return false; } + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + Primary.setReleaseToOsIntervalMs(static_cast(Value)); + Secondary.setReleaseToOsIntervalMs(static_cast(Value)); + return true; + } + return false; + } // Return the usable size for a given chunk. Technically we lie, as we just // report the actual size of a chunk. This is done to counteract code actively diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc index 27aa969e608ac..342af1c79ad64 100644 --- a/compiler-rt/lib/scudo/standalone/flags.inc +++ b/compiler-rt/lib/scudo/standalone/flags.inc @@ -45,6 +45,6 @@ SCUDO_FLAG(bool, may_return_null, true, "returning NULL in otherwise non-fatal error scenarios, eg: OOM, " "invalid allocation alignments, etc.") -SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? 1000 : 5000, +SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000, "Interval (in milliseconds) at which to attempt release of unused " "memory to the OS. Negative values disable the feature.") diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index 294043930e862..79345cb348b64 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -38,14 +38,18 @@ namespace scudo { // Memory used by this allocator is never unmapped but can be partially // reclaimed if the platform allows for it. -template class SizeClassAllocator32 { +template class SizeClassAllocator32 { public: typedef SizeClassMapT SizeClassMap; // The bytemap can only track UINT8_MAX - 1 classes. static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), ""); // Regions should be large enough to hold the largest Block. 
static_assert((1UL << RegionSizeLog) >= SizeClassMap::MaxSize, ""); - typedef SizeClassAllocator32 ThisT; + typedef SizeClassAllocator32 ThisT; typedef SizeClassAllocatorLocalCache CacheT; typedef typename CacheT::TransferBatch TransferBatch; static const bool SupportsMemoryTagging = false; @@ -78,7 +82,7 @@ template class SizeClassAllocator32 { Sci->CanRelease = (I != SizeClassMap::BatchClassId) && (getSizeByClassId(I) >= (PageSize / 32)); } - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -176,6 +180,15 @@ template class SizeClassAllocator32 { getStats(Str, I, 0); } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { @@ -356,6 +369,10 @@ template class SizeClassAllocator32 { AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -374,7 +391,7 @@ template class SizeClassAllocator32 { } if (!Force) { - const s32 IntervalMs = ReleaseToOsIntervalMs; + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Sci->ReleaseInfo.LastReleaseAtNs + @@ -414,7 +431,7 @@ template class SizeClassAllocator32 { // through the whole NumRegions. 
uptr MinRegionIndex; uptr MaxRegionIndex; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; // Unless several threads request regions simultaneously from different size // classes, the stash rarely contains more than 1 entry. static constexpr uptr MaxStashedRegions = 4; diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 9d8dcac6562a0..bc31db88ebb8b 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -40,11 +40,15 @@ namespace scudo { // released if the platform allows for it. template class SizeClassAllocator64 { public: typedef SizeClassMapT SizeClassMap; typedef SizeClassAllocator64 ThisT; typedef SizeClassAllocatorLocalCache CacheT; @@ -90,7 +94,7 @@ class SizeClassAllocator64 { (getSizeByClassId(I) >= (PageSize / 32)); Region->RandState = getRandomU32(&Seed); } - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); if (SupportsMemoryTagging) UseMemoryTagging = systemSupportsMemoryTagging(); @@ -186,6 +190,15 @@ class SizeClassAllocator64 { getStats(Str, I, 0); } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { @@ -241,7 +254,7 @@ class SizeClassAllocator64 { uptr PrimaryBase; RegionInfo *RegionInfoArray; MapPlatformData Data; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; bool UseMemoryTagging; RegionInfo *getRegionInfo(uptr ClassId) const { @@ -375,6 +388,10 @@ class SizeClassAllocator64 { getRegionBaseByClassId(ClassId)); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + 
} + NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -394,7 +411,7 @@ class SizeClassAllocator64 { } if (!Force) { - const s32 IntervalMs = ReleaseToOsIntervalMs; + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Region->ReleaseInfo.LastReleaseAtNs + diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index deba7a930d986..8ae8108b2eaad 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -62,7 +62,9 @@ class MapAllocatorNoCache { void releaseToOS() {} }; -template +template class MapAllocatorCache { public: // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length @@ -71,7 +73,7 @@ class MapAllocatorCache { static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, ""); void initLinkerInitialized(s32 ReleaseToOsInterval) { - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -105,11 +107,11 @@ class MapAllocatorCache { } } } + s32 Interval; if (EmptyCache) empty(); - else if (ReleaseToOsIntervalMs >= 0) - releaseOlderThan(Time - - static_cast(ReleaseToOsIntervalMs) * 1000000); + else if ((Interval = getReleaseToOsIntervalMs()) >= 0) + releaseOlderThan(Time - static_cast(Interval) * 1000000); return EntryCached; } @@ -142,6 +144,15 @@ class MapAllocatorCache { return MaxEntriesCount != 0U && Size <= MaxEntrySize; } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + void releaseToOS() { releaseOlderThan(UINT64_MAX); } void disable() { Mutex.lock(); } 
@@ -189,6 +200,10 @@ class MapAllocatorCache { } } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + struct CachedBlock { uptr Block; uptr BlockEnd; @@ -203,7 +218,7 @@ class MapAllocatorCache { u32 EntriesCount; uptr LargestSize; u32 IsFullEvents; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; }; template class MapAllocator { @@ -251,6 +266,10 @@ template class MapAllocator { static uptr canCache(uptr Size) { return CacheT::canCache(Size); } + void setReleaseToOsIntervalMs(s32 Interval) { + Cache.setReleaseToOsIntervalMs(Interval); + } + void releaseToOS() { Cache.releaseToOS(); } private: diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc index 91f615dcb8f84..314a835074e64 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc @@ -157,7 +157,18 @@ void SCUDO_PREFIX(malloc_postinit)() { INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) { if (param == M_DECAY_TIME) { - // TODO(kostyak): set release_to_os_interval_ms accordingly. + if (SCUDO_ANDROID) { + if (value == 0) { + // Will set the release values to their minimum values. + value = INT32_MIN; + } else { + // Will set the release values to their maximum values. 
+ value = INT32_MAX; + } + } + + SCUDO_ALLOCATOR.setOption(scudo::Option::ReleaseInterval, + static_cast(value)); return 1; } else if (param == M_PURGE) { SCUDO_ALLOCATOR.releaseToOS(); diff --git a/compiler-rt/test/asan/CMakeLists.txt b/compiler-rt/test/asan/CMakeLists.txt index f756064f47e05..1c2633eb4597b 100644 --- a/compiler-rt/test/asan/CMakeLists.txt +++ b/compiler-rt/test/asan/CMakeLists.txt @@ -44,6 +44,7 @@ endif() foreach(arch ${ASAN_TEST_ARCH}) set(ASAN_TEST_TARGET_ARCH ${arch}) set(ASAN_TEST_APPLE_PLATFORM "osx") + set(ASAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") string(TOLOWER "-${arch}-${OS_NAME}" ASAN_TEST_CONFIG_SUFFIX) get_bits_for_arch(${arch} ASAN_TEST_BITS) get_test_cc_for_arch(${arch} ASAN_TEST_TARGET_CC ASAN_TEST_TARGET_CFLAGS) @@ -104,6 +105,7 @@ if(APPLE) set(ASAN_TEST_CONFIG_SUFFIX "-${arch}-${platform}") set(ASAN_TEST_APPLE_PLATFORM "${platform}") set(ASAN_TEST_TARGET_ARCH "${arch}") + set(ASAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") get_bits_for_arch(${arch} ASAN_TEST_BITS) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in diff --git a/compiler-rt/test/asan/lit.site.cfg.py.in b/compiler-rt/test/asan/lit.site.cfg.py.in index f76b306f8577d..81cebde2029ed 100644 --- a/compiler-rt/test/asan/lit.site.cfg.py.in +++ b/compiler-rt/test/asan/lit.site.cfg.py.in @@ -6,6 +6,7 @@ config.target_cflags = "@ASAN_TEST_TARGET_CFLAGS@" config.clang = "@ASAN_TEST_TARGET_CC@" config.bits = "@ASAN_TEST_BITS@" config.apple_platform = "@ASAN_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@ASAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" config.asan_dynamic = @ASAN_TEST_DYNAMIC@ config.target_arch = "@ASAN_TEST_TARGET_ARCH@" diff --git a/compiler-rt/test/fuzzer/CMakeLists.txt b/compiler-rt/test/fuzzer/CMakeLists.txt index 5a027bd07b40b..c12a04b6f2702 100644 --- a/compiler-rt/test/fuzzer/CMakeLists.txt +++ b/compiler-rt/test/fuzzer/CMakeLists.txt @@ -53,6 +53,7 @@ 
macro(test_fuzzer stdlib) set(LIBFUZZER_TEST_TARGET_ARCH ${arch}) set(LIBFUZZER_TEST_APPLE_PLATFORM "osx") + set(LIBFUZZER_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") set(LIBFUZZER_TEST_STDLIB ${stdlib}) @@ -113,6 +114,7 @@ if (APPLE) set(LIBFUZZER_TEST_CONFIG_SUFFIX "-${arch}-${platform}") set(LIBFUZZER_TEST_APPLE_PLATFORM "${platform}") set(LIBFUZZER_TEST_TARGET_ARCH "${arch}") + set(LIBFUZZER_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py diff --git a/compiler-rt/test/fuzzer/lit.site.cfg.py.in b/compiler-rt/test/fuzzer/lit.site.cfg.py.in index cc6a0908d142c..218688c182c7c 100644 --- a/compiler-rt/test/fuzzer/lit.site.cfg.py.in +++ b/compiler-rt/test/fuzzer/lit.site.cfg.py.in @@ -5,6 +5,7 @@ config.target_flags = "@LIBFUZZER_TEST_FLAGS@" config.c_compiler = "@LIBFUZZER_TEST_COMPILER@" config.stdlib = "@LIBFUZZER_TEST_STDLIB@" config.apple_platform = "@LIBFUZZER_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@LIBFUZZER_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" config.name_suffix = "@LIBFUZZER_TEST_CONFIG_SUFFIX@" config.osx_sysroot_flag = "@OSX_SYSROOT_FLAG@" diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index bf2190a93dad3..6c4a6f526551e 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -258,6 +258,44 @@ lit.util.usePlatformSdkOnDarwin(config, lit_config) if config.host_os == 'Darwin': + def get_apple_platform_version_aligned_with(macos_version, apple_platform): + """ + Given a macOS version (`macos_version`) returns the corresponding version for + the specified Apple platform if it exists. + + `macos_version` - The macOS version as a string. + `apple_platform` - The Apple platform name as a string. + + Returns the corresponding version as a string if it exists, otherwise + `None` is returned. 
+ m = re.match(r'^10\.(?P<min>\d+)(\.(?P<patch>\d+))?$', macos_version) + if not m: + raise Exception('Could not parse macOS version: "{}"'.format(macos_version)) + ver_min = int(m.group('min')) + ver_patch = m.group('patch') + if ver_patch: + ver_patch = int(ver_patch) + else: + ver_patch = 0 + result_str = '' + if apple_platform == 'osx': + # Drop patch for now. + result_str = '10.{}'.format(ver_min) + elif apple_platform.startswith('ios') or apple_platform.startswith('tvos'): + result_maj = ver_min - 2 + if result_maj < 1: + return None + result_str = '{}.{}'.format(result_maj, ver_patch) + elif apple_platform.startswith('watch'): + result_maj = ver_min - 9 + if result_maj < 1: + return None + result_str = '{}.{}'.format(result_maj, ver_patch) + else: + raise Exception('Unsupported apple platform "{}"'.format(apple_platform)) + return result_str + osx_version = (10, 0, 0) try: osx_version = subprocess.check_output(["sw_vers", "-productVersion"]) @@ -288,12 +326,17 @@ except: pass - config.substitutions.append( ("%macos_min_target_10_11", "-mmacosx-version-min=10.11") ) - - isIOS = config.apple_platform != "osx" + min_os_aligned_with_osx_10_11 = get_apple_platform_version_aligned_with('10.11', config.apple_platform) + min_os_aligned_with_osx_10_11_flag = '' + if min_os_aligned_with_osx_10_11: + min_os_aligned_with_osx_10_11_flag = '{flag}={version}'.format( + flag=config.apple_platform_min_deployment_target_flag, + version=min_os_aligned_with_osx_10_11) + else: + lit_config.warning('Could not find a version of {} that corresponds with macOS 10.11'.format(config.apple_platform)) + config.substitutions.append( ("%macos_min_target_10_11", min_os_aligned_with_osx_10_11_flag) ) # rdar://problem/22207160 - config.substitutions.append( ("%darwin_min_target_with_full_runtime_arc_support", - "-miphoneos-version-min=9.0" if isIOS else "-mmacosx-version-min=10.11") ) + config.substitutions.append( ("%darwin_min_target_with_full_runtime_arc_support",
min_os_aligned_with_osx_10_11_flag) ) # 32-bit iOS simulator is deprecated and removed in latest Xcode. if config.apple_platform == "iossim": diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 0fb51741783e1..4de8d030070f3 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -29,6 +29,7 @@ set_default("compiler_rt_libdir", "@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@") set_default("emulator", "@COMPILER_RT_EMULATOR@") set_default("asan_shadow_scale", "@COMPILER_RT_ASAN_SHADOW_SCALE@") set_default("apple_platform", "osx") +set_default("apple_platform_min_deployment_target_flag", "-mmacosx-version-min") set_default("sanitizer_can_use_cxxabi", @SANITIZER_CAN_USE_CXXABI_PYBOOL@) set_default("has_lld", @COMPILER_RT_HAS_LLD_PYBOOL@) set_default("can_symbolize", @CAN_SYMBOLIZE@) diff --git a/compiler-rt/test/tsan/CMakeLists.txt b/compiler-rt/test/tsan/CMakeLists.txt index 7cc3537660a63..67e20e5ecedb0 100644 --- a/compiler-rt/test/tsan/CMakeLists.txt +++ b/compiler-rt/test/tsan/CMakeLists.txt @@ -30,6 +30,7 @@ endif() foreach(arch ${TSAN_TEST_ARCH}) set(TSAN_TEST_APPLE_PLATFORM "osx") + set(TSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") set(TSAN_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}" TSAN_TEST_CONFIG_SUFFIX) @@ -77,6 +78,7 @@ if(APPLE) set(TSAN_TEST_CONFIG_SUFFIX "-${arch}-${platform}") set(TSAN_TEST_APPLE_PLATFORM "${platform}") set(TSAN_TEST_TARGET_ARCH "${arch}") + set(TSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py diff --git a/compiler-rt/test/tsan/lit.site.cfg.py.in b/compiler-rt/test/tsan/lit.site.cfg.py.in index 5e8d610e5df08..c6d453aaee26f 100644 --- a/compiler-rt/test/tsan/lit.site.cfg.py.in +++ b/compiler-rt/test/tsan/lit.site.cfg.py.in @@ -4,6 +4,7 @@ config.name_suffix = 
"@TSAN_TEST_CONFIG_SUFFIX@" config.tsan_lit_source_dir = "@TSAN_LIT_SOURCE_DIR@" config.has_libcxx = @TSAN_HAS_LIBCXX@ config.apple_platform = "@TSAN_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@TSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" config.target_cflags = "@TSAN_TEST_TARGET_CFLAGS@" config.target_arch = "@TSAN_TEST_TARGET_ARCH@" config.deflake_threshold = "@TSAN_TEST_DEFLAKE_THRESHOLD@" diff --git a/compiler-rt/test/ubsan/CMakeLists.txt b/compiler-rt/test/ubsan/CMakeLists.txt index 1ef554f0a88ae..f7ca0e5c04bb1 100644 --- a/compiler-rt/test/ubsan/CMakeLists.txt +++ b/compiler-rt/test/ubsan/CMakeLists.txt @@ -43,6 +43,10 @@ endif() foreach(arch ${UBSAN_TEST_ARCH}) set(UBSAN_TEST_TARGET_ARCH ${arch}) + if (APPLE) + set(UBSAN_TEST_APPLE_PLATFORM "osx") + set(UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") + endif() get_test_cc_for_arch(${arch} UBSAN_TEST_TARGET_CC UBSAN_TEST_TARGET_CFLAGS) add_ubsan_testsuites("Standalone" ubsan ${arch}) @@ -73,8 +77,10 @@ macro(add_ubsan_device_testsuite test_mode sanitizer platform arch) set(UBSAN_TEST_USE_THINLTO "False") if (APPLE) set(UBSAN_TEST_APPLE_PLATFORM "${platform}") + set(UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") else() unset(UBSAN_TEST_APPLE_PLATFORM) + unset(UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG) endif() configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in diff --git a/compiler-rt/test/ubsan/lit.site.cfg.py.in b/compiler-rt/test/ubsan/lit.site.cfg.py.in index 6a29917b86ee4..4dfd5c5b5c1a9 100644 --- a/compiler-rt/test/ubsan/lit.site.cfg.py.in +++ b/compiler-rt/test/ubsan/lit.site.cfg.py.in @@ -8,6 +8,7 @@ config.target_arch = "@UBSAN_TEST_TARGET_ARCH@" config.use_lld = @UBSAN_TEST_USE_LLD@ config.use_thinlto = @UBSAN_TEST_USE_THINLTO@ config.apple_platform = "@UBSAN_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" # Load common config for all 
compiler-rt lit tests. lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured") diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index bd62d6db39e8e..87eee4bf5b424 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -57,6 +57,32 @@ using ::max_align_t; typedef long double max_align_t; #endif +template struct __libcpp_is_integral { enum { value = 0 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#ifndef _LIBCPP_NO_HAS_CHAR8_T +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#endif +#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#endif // _LIBCPP_HAS_NO_UNICODE_CHARS +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#ifndef _LIBCPP_HAS_NO_INT128 +template <> struct __libcpp_is_integral<__int128_t> { enum { value = 1 }; }; +template <> struct __libcpp_is_integral<__uint128_t> { enum { value = 1 }; }; +#endif + _LIBCPP_END_NAMESPACE_STD #if _LIBCPP_STD_VER > 14 @@ -64,6 +90,11 @@ namespace std // purposefully not versioned { enum class byte : unsigned char {}; + +template struct __enable_if_integral_imp {}; +template <> 
struct __enable_if_integral_imp { using type = byte; }; +template using _EnableByteOverload = typename __enable_if_integral_imp<__libcpp_is_integral<_Tp>::value>::type; + constexpr byte operator| (byte __lhs, byte __rhs) noexcept { return static_cast( @@ -104,10 +135,31 @@ constexpr byte operator~ (byte __b) noexcept ~static_cast(__b) )); } - +template + constexpr _EnableByteOverload<_Integer> & + operator<<=(byte& __lhs, _Integer __shift) noexcept + { return __lhs = __lhs << __shift; } + +template + constexpr _EnableByteOverload<_Integer> + operator<< (byte __lhs, _Integer __shift) noexcept + { return static_cast(static_cast(static_cast(__lhs) << __shift)); } + +template + constexpr _EnableByteOverload<_Integer> & + operator>>=(byte& __lhs, _Integer __shift) noexcept + { return __lhs = __lhs >> __shift; } + +template + constexpr _EnableByteOverload<_Integer> + operator>> (byte __lhs, _Integer __shift) noexcept + { return static_cast(static_cast(static_cast(__lhs) >> __shift)); } + +template > + constexpr _Integer + to_integer(byte __b) noexcept { return static_cast<_Integer>(__b); } } -#include // rest of byte #endif #endif // _LIBCPP_CSTDDEF diff --git a/libcxx/include/span b/libcxx/include/span index 82bcbff402b1e..1fe1496530e98 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -307,13 +307,13 @@ public: _LIBCPP_INLINE_VISIBILITY constexpr reference front() const noexcept { - static_assert(_Extent > 0, "span[].front() on empty span"); + _LIBCPP_ASSERT(!empty(), "span::front() on empty span"); return __data[0]; } _LIBCPP_INLINE_VISIBILITY constexpr reference back() const noexcept { - static_assert(_Extent > 0, "span[].back() on empty span"); + _LIBCPP_ASSERT(!empty(), "span::back() on empty span"); return __data[size()-1]; } diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index f8ee5648d3581..6b8b855afc650 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -735,34 +735,8 @@ _LIBCPP_INLINE_VAR 
_LIBCPP_CONSTEXPR bool is_null_pointer_v // is_integral -template struct __libcpp_is_integral : public false_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -#ifndef _LIBCPP_NO_HAS_CHAR8_T -template <> struct __libcpp_is_integral : public true_type {}; -#endif -#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -#endif // _LIBCPP_HAS_NO_UNICODE_CHARS -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -#ifndef _LIBCPP_HAS_NO_INT128 -template <> struct __libcpp_is_integral<__int128_t> : public true_type {}; -template <> struct __libcpp_is_integral<__uint128_t> : public true_type {}; -#endif - template struct _LIBCPP_TEMPLATE_VIS is_integral - : public __libcpp_is_integral::type> {}; + : public _BoolConstant<__libcpp_is_integral::type>::value> {}; #if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) template @@ -4046,29 +4020,7 @@ _LIBCPP_END_NAMESPACE_STD // std::byte namespace std // purposefully not versioned { -template - constexpr typename enable_if, byte>::type & - operator<<=(byte& __lhs, _Integer __shift) noexcept - { return __lhs = __lhs << __shift; } - -template - constexpr typename enable_if, 
byte>::type - operator<< (byte __lhs, _Integer __shift) noexcept - { return static_cast(static_cast(static_cast(__lhs) << __shift)); } - -template - constexpr typename enable_if, byte>::type & - operator>>=(byte& __lhs, _Integer __shift) noexcept - { return __lhs = __lhs >> __shift; } - -template - constexpr typename enable_if, byte>::type - operator>> (byte __lhs, _Integer __shift) noexcept - { return static_cast(static_cast(static_cast(__lhs) >> __shift)); } - -template - constexpr typename enable_if, _Integer>::type - to_integer(byte __b) noexcept { return static_cast<_Integer>(__b); } + } #endif diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 27601769a83be..74813cc5016df 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -60,6 +60,7 @@ public: #include #include #include +#include #ifdef _LIBCPP_NO_EXCEPTIONS #include #endif diff --git a/libcxx/test/std/containers/views/span.elem/back.pass.cpp b/libcxx/test/std/containers/views/span.elem/back.pass.cpp index f2c0cf60dbe80..5bb9631aa90b1 100644 --- a/libcxx/test/std/containers/views/span.elem/back.pass.cpp +++ b/libcxx/test/std/containers/views/span.elem/back.pass.cpp @@ -30,7 +30,6 @@ constexpr bool testConstexprSpan(Span sp) return std::addressof(sp.back()) == sp.data() + sp.size() - 1; } - template void testRuntimeSpan(Span sp) { @@ -38,6 +37,12 @@ void testRuntimeSpan(Span sp) assert(std::addressof(sp.back()) == sp.data() + sp.size() - 1); } +template +void testEmptySpan(Span sp) +{ + if (!sp.empty()) + [[maybe_unused]] auto res = sp.back(); +} struct A{}; constexpr int iArr1[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -71,5 +76,8 @@ int main(int, char**) testRuntimeSpan(std::span (&s, 1)); testRuntimeSpan(std::span(&s, 1)); + std::span sp; + testEmptySpan(sp); + return 0; } diff --git a/libcxx/test/std/containers/views/span.elem/front.pass.cpp b/libcxx/test/std/containers/views/span.elem/front.pass.cpp index 7f18a2422b395..e17f7dd1576dd 100644 --- 
a/libcxx/test/std/containers/views/span.elem/front.pass.cpp +++ b/libcxx/test/std/containers/views/span.elem/front.pass.cpp @@ -38,6 +38,12 @@ void testRuntimeSpan(Span sp) assert(std::addressof(sp.front()) == sp.data()); } +template +void testEmptySpan(Span sp) +{ + if (!sp.empty()) + [[maybe_unused]] auto res = sp.front(); +} struct A{}; constexpr int iArr1[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -71,5 +77,8 @@ int main(int, char**) testRuntimeSpan(std::span (&s, 1)); testRuntimeSpan(std::span(&s, 1)); + std::span sp; + testEmptySpan(sp); + return 0; } diff --git a/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp b/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp index 657d17d9c4516..ef1779e1b45fe 100644 --- a/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp +++ b/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include // UNSUPPORTED: c++98, c++03, c++11, c++14 diff --git a/libcxx/test/std/numerics/c.math/abs.pass.cpp b/libcxx/test/std/numerics/c.math/abs.pass.cpp index 3993dd34318a7..03aae465c2573 100644 --- a/libcxx/test/std/numerics/c.math/abs.pass.cpp +++ b/libcxx/test/std/numerics/c.math/abs.pass.cpp @@ -47,7 +47,7 @@ int main(int, char**) { // On some systems char is unsigned. // If that is the case, we should just test signed char twice. - typedef typename std::conditional< + typedef std::conditional< std::is_signed::value, char, signed char >::type SignedChar; @@ -63,10 +63,10 @@ int main(int, char**) // Here there is no guarantee that int is larger than int8_t so we // use a helper type trait to conditional test against int. 
- test_abs::type>(); - test_abs::type>(); - test_abs::type>(); - test_abs::type>(); + test_abs::type>(); + test_abs::type>(); + test_abs::type>(); + test_abs::type>(); test_abs(); test_abs(); diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index bd01bc02617c8..3a0251c90692f 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2747,8 +2747,8 @@ createSymbols(ArrayRef> nameAttrs, size_t numShards = 32; size_t concurrency = 1; if (threadsEnabled) - concurrency = - std::min(PowerOf2Floor(hardware_concurrency()), numShards); + concurrency = std::min( + hardware_concurrency().compute_thread_count(), numShards); // A sharded map to uniquify symbols by name. std::vector> map(numShards); @@ -3191,8 +3191,8 @@ void MergeNoTailSection::finalizeContents() { // operations in the following tight loop. size_t concurrency = 1; if (threadsEnabled) - concurrency = - std::min(PowerOf2Floor(hardware_concurrency()), numShards); + concurrency = std::min( + hardware_concurrency().compute_thread_count(), numShards); // Add section pieces to the builders. 
parallelForEachN(0, concurrency, [&](size_t threadId) { diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index af4507bf3496b..1544ecc386c7b 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -22,14 +22,14 @@ #include #include -#define LLDB_PLUGIN(PluginName) \ +#define LLDB_PLUGIN_DEFINE(PluginName) \ namespace lldb_private { \ void lldb_initialize_##PluginName() { PluginName::Initialize(); } \ void lldb_terminate_##PluginName() { PluginName::Terminate(); } \ } // FIXME: Generate me with CMake -#define LLDB_PLUGIN_DECLARE(PluginName) \ +#define LLDB_PLUGIN_DECLARE(PluginName) \ namespace lldb_private { \ extern void lldb_initialize_##PluginName(); \ extern void lldb_terminate_##PluginName(); \ diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 6d82a9826b393..f73255d322409 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -99,18 +99,24 @@ bool IsOnlySpaces(const EditLineStringType &content) { static int GetOperation(HistoryOperation op) { // The naming used by editline for the history operations is counter - // intuitive to how it's used here. + // intuitive to how it's used in LLDB's editline implementation. + // + // - The H_LAST returns the oldest entry in the history. // // - The H_PREV operation returns the previous element in the history, which // is newer than the current one. // + // - The H_CURR returns the current entry in the history. + // // - The H_NEXT operation returns the next element in the history, which is // older than the current one. // + // - The H_FIRST returns the most recent entry in the history. + // // The naming of the enum entries match the semantic meaning. 
switch(op) { case HistoryOperation::Oldest: - return H_FIRST; + return H_LAST; case HistoryOperation::Older: return H_NEXT; case HistoryOperation::Current: @@ -118,7 +124,7 @@ static int GetOperation(HistoryOperation op) { case HistoryOperation::Newer: return H_PREV; case HistoryOperation::Newest: - return H_LAST; + return H_FIRST; } llvm_unreachable("Fully covered switch!"); } diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index f37bc1d235897..43cc4c3cd87bf 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -11,7 +11,7 @@ #include "ABISysV_arm64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIAArch64) +LLDB_PLUGIN_DEFINE(ABIAArch64) void ABIAArch64::Initialize() { ABISysV_arm64::Initialize(); diff --git a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp index 7726c1b891de0..1690f1c511f2a 100644 --- a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp +++ b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp @@ -55,7 +55,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_arc) +LLDB_PLUGIN_DEFINE(ABISysV_arc) namespace { namespace dwarf { diff --git a/lldb/source/Plugins/ABI/ARM/ABIARM.cpp b/lldb/source/Plugins/ABI/ARM/ABIARM.cpp index 790cb877b91e2..882c14d386e31 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIARM.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIARM.cpp @@ -11,7 +11,7 @@ #include "ABISysV_arm.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIARM) +LLDB_PLUGIN_DEFINE(ABIARM) void ABIARM::Initialize() { ABISysV_arm::Initialize(); diff --git a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp index 4d38b9165728f..73d8308ae0dc4 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; 
-LLDB_PLUGIN(ABIMacOSX_arm) +LLDB_PLUGIN_DEFINE(ABIMacOSX_arm) static RegisterInfo g_register_infos[] = { // NAME ALT SZ OFF ENCODING FORMAT EH_FRAME diff --git a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp index 8d7867827f602..1a93bac564f72 100644 --- a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_arm) +LLDB_PLUGIN_DEFINE(ABISysV_arm) static RegisterInfo g_register_infos[] = { // NAME ALT SZ OFF ENCODING FORMAT EH_FRAME diff --git a/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp b/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp index 65407bfe2543a..601d9c2f0f052 100644 --- a/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp +++ b/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_hexagon) +LLDB_PLUGIN_DEFINE(ABISysV_hexagon) static RegisterInfo g_register_infos[] = { // hexagon-core.xml diff --git a/lldb/source/Plugins/ABI/Mips/ABIMips.cpp b/lldb/source/Plugins/ABI/Mips/ABIMips.cpp index 08e694a659b41..16ef1faf9d9d6 100644 --- a/lldb/source/Plugins/ABI/Mips/ABIMips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABIMips.cpp @@ -11,7 +11,7 @@ #include "ABISysV_mips64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIMips) +LLDB_PLUGIN_DEFINE(ABIMips) void ABIMips::Initialize() { ABISysV_mips::Initialize(); diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp index 401646a334f1d..d66e0926ad99e 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_mips) +LLDB_PLUGIN_DEFINE(ABISysV_mips) enum dwarf_regnums { dwarf_r0 = 0, diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp 
b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp index ea42f0c8fe17c..bb28a50e5f4ab 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_mips64) +LLDB_PLUGIN_DEFINE(ABISysV_mips64) enum dwarf_regnums { dwarf_r0 = 0, diff --git a/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp b/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp index b1591dba6a1bb..b561e3c93f571 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp @@ -11,7 +11,7 @@ #include "ABISysV_ppc64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIPowerPC) +LLDB_PLUGIN_DEFINE(ABIPowerPC) void ABIPowerPC::Initialize() { ABISysV_ppc::Initialize(); diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp index c4d90a69a0320..6f5eded7b0315 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_ppc) +LLDB_PLUGIN_DEFINE(ABISysV_ppc) enum dwarf_regnums { dwarf_r0 = 0, diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index dba347d3ceafc..251ac972fd768 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -47,7 +47,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_ppc64) +LLDB_PLUGIN_DEFINE(ABISysV_ppc64) const lldb_private::RegisterInfo * ABISysV_ppc64::GetRegisterInfoArray(uint32_t &count) { diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp index c01e088cd7996..bfeaa1226df26 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp +++ 
b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_s390x) +LLDB_PLUGIN_DEFINE(ABISysV_s390x) enum dwarf_regnums { // General Purpose Registers diff --git a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp index 610baa2ca0869..d11c1af1d2599 100644 --- a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp @@ -29,7 +29,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABIMacOSX_i386) +LLDB_PLUGIN_DEFINE(ABIMacOSX_i386) enum { ehframe_eax = 0, diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp index a89e0baad1a2d..8fc22b21623cd 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp @@ -31,7 +31,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_i386) +LLDB_PLUGIN_DEFINE(ABISysV_i386) // This source file uses the following document as a reference: //==================================================================== diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index 571b796652eec..01671190e106f 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -35,7 +35,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_x86_64) +LLDB_PLUGIN_DEFINE(ABISysV_x86_64) enum dwarf_regnums { dwarf_rax = 0, diff --git a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp index 6a7c98323037f..37b1aedcd463c 100644 --- a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp @@ -33,7 +33,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABIWindows_x86_64) 
+LLDB_PLUGIN_DEFINE(ABIWindows_x86_64) enum dwarf_regnums { dwarf_rax = 0, diff --git a/lldb/source/Plugins/ABI/X86/ABIX86.cpp b/lldb/source/Plugins/ABI/X86/ABIX86.cpp index 207d0b289d67f..714bf25f482ce 100644 --- a/lldb/source/Plugins/ABI/X86/ABIX86.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIX86.cpp @@ -13,7 +13,7 @@ #include "ABIWindows_x86_64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIX86) +LLDB_PLUGIN_DEFINE(ABIX86) void ABIX86::Initialize() { ABIMacOSX_i386::Initialize(); diff --git a/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp b/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp index 7fb9281fb7875..58c7cbb4530ad 100644 --- a/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp +++ b/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp @@ -17,7 +17,7 @@ using namespace lldb_private; using namespace lldb; -LLDB_PLUGIN(ArchitectureArm) +LLDB_PLUGIN_DEFINE(ArchitectureArm) ConstString ArchitectureArm::GetPluginNameStatic() { return ConstString("arm"); diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp index e8240ce0d725b..f426ac63e4b53 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp @@ -21,7 +21,7 @@ using namespace lldb_private; using namespace lldb; -LLDB_PLUGIN(ArchitectureMips) +LLDB_PLUGIN_DEFINE(ArchitectureMips) ConstString ArchitectureMips::GetPluginNameStatic() { return ConstString("mips"); diff --git a/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp b/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp index 83d6832381e73..94301ecf052c1 100644 --- a/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp +++ b/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp @@ -20,7 +20,7 @@ using namespace lldb_private; using namespace lldb; -LLDB_PLUGIN(ArchitecturePPC64) +LLDB_PLUGIN_DEFINE(ArchitecturePPC64) ConstString 
ArchitecturePPC64::GetPluginNameStatic() { return ConstString("ppc64"); diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 139bda59a60c0..6427d8d176c86 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -43,7 +43,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DisassemblerLLVMC) +LLDB_PLUGIN_DEFINE(DisassemblerLLVMC) class DisassemblerLLVMC::MCDisasmInstance { public: diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 26f85906e3173..193b3bd829c54 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderDarwinKernel) +LLDB_PLUGIN_DEFINE(DynamicLoaderDarwinKernel) // Progressively greater amounts of scanning we will allow For some targets // very early in startup, we can't do any random reads of memory or we can diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp index 9cb6d1fcb612d..a6db648baa1af 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp @@ -47,7 +47,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderMacOSXDYLD) +LLDB_PLUGIN_DEFINE(DynamicLoaderMacOSXDYLD) // Create an instance of this class. 
This function is filled into the plugin // info class that gets handed out by the plugin factory and allows the lldb to diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index eed8a487d258b..c572c3024f9ce 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -29,7 +29,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderPOSIXDYLD) +LLDB_PLUGIN_DEFINE(DynamicLoaderPOSIXDYLD) void DynamicLoaderPOSIXDYLD::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), diff --git a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp index 651d233cd025b..13aad5f4ccb66 100644 --- a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp +++ b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp @@ -17,7 +17,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderStatic) +LLDB_PLUGIN_DEFINE(DynamicLoaderStatic) // Create an instance of this class. 
This function is filled into the plugin // info class that gets handed out by the plugin factory and allows the lldb to diff --git a/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp b/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp index 442eae7d8d09a..e4eceb2bd63c8 100644 --- a/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp @@ -23,7 +23,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderWindowsDYLD) +LLDB_PLUGIN_DEFINE(DynamicLoaderWindowsDYLD) DynamicLoaderWindowsDYLD::DynamicLoaderWindowsDYLD(Process *process) : DynamicLoader(process) {} diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp index 62d69953fe765..e87bc1f75f5ce 100644 --- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp +++ b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp @@ -30,7 +30,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionARM) +LLDB_PLUGIN_DEFINE(EmulateInstructionARM) // Convenient macro definitions. 
#define APSR_C Bit32(m_opcode_cpsr, CPSR_C_POS) diff --git a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp index 01cd03de60ab9..144d383732470 100644 --- a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp +++ b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp @@ -47,7 +47,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionARM64) +LLDB_PLUGIN_DEFINE(EmulateInstructionARM64) static bool LLDBTableGetRegisterInfo(uint32_t reg_num, RegisterInfo ®_info) { if (reg_num >= llvm::array_lengthof(g_register_infos_arm64_le)) diff --git a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp index f4a947599f518..ae74c89c4f2eb 100644 --- a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp +++ b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp @@ -40,7 +40,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionMIPS) +LLDB_PLUGIN_DEFINE(EmulateInstructionMIPS) #define UInt(x) ((uint64_t)x) #define integer int64_t diff --git a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp index 711e6d594eb63..9a578ab408f74 100644 --- a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp +++ b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp @@ -40,7 +40,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionMIPS64) +LLDB_PLUGIN_DEFINE(EmulateInstructionMIPS64) #define UInt(x) ((uint64_t)x) #define integer int64_t diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp index 52175ef5f4ae7..2588c935dd6b7 100644 --- 
a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionPPC64) +LLDB_PLUGIN_DEFINE(EmulateInstructionPPC64) EmulateInstructionPPC64::EmulateInstructionPPC64(const ArchSpec &arch) : EmulateInstruction(arch) {} diff --git a/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp b/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp index dc23b604722d6..e78ea3a684836 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp @@ -30,7 +30,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeASan) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeASan) lldb::InstrumentationRuntimeSP InstrumentationRuntimeASan::CreateInstance(const lldb::ProcessSP &process_sp) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp b/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp index 91c411a4f013a..72d28c3474576 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp @@ -29,7 +29,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeMainThreadChecker) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeMainThreadChecker) InstrumentationRuntimeMainThreadChecker:: ~InstrumentationRuntimeMainThreadChecker() { diff --git a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp index 
4229626077345..f4c116e7576c4 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp @@ -35,7 +35,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeTSan) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeTSan) lldb::InstrumentationRuntimeSP InstrumentationRuntimeTSan::CreateInstance(const lldb::ProcessSP &process_sp) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp index b13eac6081462..b60eb53f3d4a7 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp @@ -36,7 +36,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeUBSan) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeUBSan) InstrumentationRuntimeUBSan::~InstrumentationRuntimeUBSan() { Deactivate(); } diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index ad089ad0d2295..df9f700a7f185 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(JITLoaderGDB) +LLDB_PLUGIN_DEFINE(JITLoaderGDB) // Debug Interface Structures enum jit_actions_t { JIT_NOACTION = 0, JIT_REGISTER_FN, JIT_UNREGISTER_FN }; diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index ea2c0104cf2a1..97084da5fffad 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -43,7 +43,7 @@ using namespace lldb; using namespace 
lldb_private; using namespace lldb_private::formatters; -LLDB_PLUGIN(CPlusPlusLanguage) +LLDB_PLUGIN_DEFINE(CPlusPlusLanguage) void CPlusPlusLanguage::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), "C++ Language", diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 82fe9b39b81f2..6b2a5f845d734 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -37,7 +37,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::formatters; -LLDB_PLUGIN(ObjCLanguage) +LLDB_PLUGIN_DEFINE(ObjCLanguage) void ObjCLanguage::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), "Objective-C Language", diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp index 207cec1a01f91..0a4017eda434c 100644 --- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp @@ -14,7 +14,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ObjCPlusPlusLanguage) +LLDB_PLUGIN_DEFINE(ObjCPlusPlusLanguage) bool ObjCPlusPlusLanguage::IsSourceFile(llvm::StringRef file_path) const { const auto suffixes = {".h", ".mm"}; diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp index 35418e0c2ffdc..e08f0f070f6c0 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp @@ -40,7 +40,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ItaniumABILanguageRuntime) +LLDB_PLUGIN_DEFINE(ItaniumABILanguageRuntime) static const char 
*vtable_demangled_prefix = "vtable for "; diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp index 3d58f41235eaa..cca6911485a04 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(AppleObjCRuntime) +LLDB_PLUGIN_DEFINE(AppleObjCRuntime) char AppleObjCRuntime::ID = 0; diff --git a/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp index 9b81ba03148cb..f2b95028f807a 100644 --- a/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp @@ -46,7 +46,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_renderscript; -LLDB_PLUGIN(RenderScriptRuntime) +LLDB_PLUGIN_DEFINE(RenderScriptRuntime) #define FMT_COORD "(%" PRIu32 ", %" PRIu32 ", %" PRIu32 ")" diff --git a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp index 0c8250c5de895..4b9da8f76fd24 100644 --- a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp +++ b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp @@ -28,7 +28,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(MemoryHistoryASan) +LLDB_PLUGIN_DEFINE(MemoryHistoryASan) MemoryHistorySP MemoryHistoryASan::CreateInstance(const ProcessSP &process_sp) { if (!process_sp.get()) diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp 
b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp index 3d4885379e86d..47c7ae8c8d639 100644 --- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp +++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp @@ -40,7 +40,7 @@ typedef struct ar_hdr { using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ObjectContainerBSDArchive) +LLDB_PLUGIN_DEFINE(ObjectContainerBSDArchive) ObjectContainerBSDArchive::Object::Object() : ar_name(), modification_time(0), uid(0), gid(0), mode(0), size(0), diff --git a/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp b/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp index ef763addede4f..bc30e57d1d0cd 100644 --- a/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp +++ b/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp @@ -20,7 +20,7 @@ using namespace lldb; using namespace lldb_private; using namespace llvm::MachO; -LLDB_PLUGIN(ObjectContainerUniversalMachO) +LLDB_PLUGIN_DEFINE(ObjectContainerUniversalMachO) void ObjectContainerUniversalMachO::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp index f36305be5960e..7a9163ddb8801 100644 --- a/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp +++ b/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp @@ -16,7 +16,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::breakpad; -LLDB_PLUGIN(ObjectFileBreakpad) +LLDB_PLUGIN_DEFINE(ObjectFileBreakpad) namespace { struct Header { diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index b97a326fde3ed..a328e16e4bde5 100644 --- 
a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -51,7 +51,7 @@ using namespace lldb_private; using namespace elf; using namespace llvm::ELF; -LLDB_PLUGIN(ObjectFileELF) +LLDB_PLUGIN_DEFINE(ObjectFileELF) namespace { diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index ce7d293f205d7..afa9b645cbecc 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -66,7 +66,7 @@ using namespace lldb; using namespace lldb_private; using namespace llvm::MachO; -LLDB_PLUGIN(ObjectFileMachO) +LLDB_PLUGIN_DEFINE(ObjectFileMachO) // Some structure definitions needed for parsing the dyld shared cache files // found on iOS devices. diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index ec11fdd0e3750..38b4472f50a75 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -41,7 +41,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ObjectFilePECOFF) +LLDB_PLUGIN_DEFINE(ObjectFilePECOFF) struct CVInfoPdb70 { // 16-byte GUID diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp index 270626061737f..b9561bdff9f3e 100644 --- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp +++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp @@ -28,7 +28,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::wasm; -LLDB_PLUGIN(ObjectFileWasm) +LLDB_PLUGIN_DEFINE(ObjectFileWasm) static const uint32_t kWasmHeaderSize = sizeof(llvm::wasm::WasmMagic) + sizeof(llvm::wasm::WasmVersion); diff --git a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp 
b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp index da65e9f54a73f..417aa2e21436f 100644 --- a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp +++ b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp @@ -39,7 +39,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(OperatingSystemPython) +LLDB_PLUGIN_DEFINE(OperatingSystemPython) void OperatingSystemPython::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp index 6ce0858787a1e..9949fbf18fa33 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp @@ -26,7 +26,7 @@ using namespace lldb_private; using namespace lldb_private::platform_android; using namespace std::chrono; -LLDB_PLUGIN(PlatformAndroid) +LLDB_PLUGIN_DEFINE(PlatformAndroid) static uint32_t g_initialize_count = 0; static const unsigned int g_android_default_cache_size = diff --git a/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp b/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp index e7a3ea2e6ec7c..97c2f22b505f5 100644 --- a/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp +++ b/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp @@ -36,7 +36,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_freebsd; -LLDB_PLUGIN(PlatformFreeBSD) +LLDB_PLUGIN_DEFINE(PlatformFreeBSD) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp b/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp index 66c3529253817..cea87c4d90ad1 100644 --- a/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp +++ b/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; using namespace 
lldb_private::platform_linux; -LLDB_PLUGIN(PlatformLinux) +LLDB_PLUGIN_DEFINE(PlatformLinux) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index c62940f35e5c7..38de91a30cf65 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -39,7 +39,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(PlatformMacOSX) +LLDB_PLUGIN_DEFINE(PlatformMacOSX) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp index 567c64ca5519f..b37cdecd38c4d 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(PlatformRemoteiOS) +LLDB_PLUGIN_DEFINE(PlatformRemoteiOS) // Static Variables static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp b/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp index 1c7980151f61b..9942c339650ee 100644 --- a/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp +++ b/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_netbsd; -LLDB_PLUGIN(PlatformNetBSD) +LLDB_PLUGIN_DEFINE(PlatformNetBSD) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp b/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp index 36f5e1692db28..a743970990a64 100644 --- a/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp +++ b/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; using namespace 
lldb_private::platform_openbsd; -LLDB_PLUGIN(PlatformOpenBSD) +LLDB_PLUGIN_DEFINE(PlatformOpenBSD) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp index 01250b34ddc98..7983c1d461b6c 100644 --- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp +++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp @@ -27,7 +27,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(PlatformWindows) +LLDB_PLUGIN_DEFINE(PlatformWindows) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp index b581c0783cf85..550b68090e7a8 100644 --- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp @@ -35,7 +35,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_gdb_server; -LLDB_PLUGIN(PlatformRemoteGDBServer) +LLDB_PLUGIN_DEFINE(PlatformRemoteGDBServer) static bool g_initialized = false; diff --git a/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp index a11959aa5a2f6..7228ec987ad1e 100644 --- a/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessFreeBSD) +LLDB_PLUGIN_DEFINE(ProcessFreeBSD) namespace { UnixSignalsSP &GetFreeBSDSignals() { diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index c0f31d76d018c..e78912e3cd305 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -50,7 +50,7 @@ using namespace 
lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessKDP) +LLDB_PLUGIN_DEFINE(ProcessKDP) namespace { diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index a35e6c08c63da..286a95fa58947 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessWindows) +LLDB_PLUGIN_DEFINE(ProcessWindows) namespace { std::string GetProcessExecutableName(HANDLE process_handle) { diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index 4056800c9de3f..aa95e92607ad7 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -35,7 +35,7 @@ using namespace lldb_private; namespace ELF = llvm::ELF; -LLDB_PLUGIN(ProcessElfCore) +LLDB_PLUGIN_DEFINE(ProcessElfCore) ConstString ProcessElfCore::GetPluginNameStatic() { static ConstString g_name("elf-core"); diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3296560658229..c7fc0161d53aa 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -90,7 +90,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::process_gdb_remote; -LLDB_PLUGIN(ProcessGDBRemote) +LLDB_PLUGIN_DEFINE(ProcessGDBRemote) namespace lldb { // Provide a function that can easily dump the packet history if we know a diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index 3e6ba49bf0dd8..1628dc545c9b7 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ 
b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessMachCore) +LLDB_PLUGIN_DEFINE(ProcessMachCore) ConstString ProcessMachCore::GetPluginNameStatic() { static ConstString g_name("mach-o-core"); diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index dcbeb3bf81716..0ce3b580c1f5c 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -41,7 +41,7 @@ using namespace lldb; using namespace lldb_private; using namespace minidump; -LLDB_PLUGIN(ProcessMinidump) +LLDB_PLUGIN_DEFINE(ProcessMinidump) namespace { diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 629f209b91fd8..ecbd30c10ae01 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -19,7 +19,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ScriptInterpreterLua) +LLDB_PLUGIN_DEFINE(ScriptInterpreterLua) class IOHandlerLuaInterpreter : public IOHandlerDelegate, public IOHandlerEditline { diff --git a/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp b/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp index 23ff6b159633a..d9c32cc132d4c 100644 --- a/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp @@ -20,7 +20,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ScriptInterpreterNone) +LLDB_PLUGIN_DEFINE(ScriptInterpreterNone) ScriptInterpreterNone::ScriptInterpreterNone(Debugger &debugger) : ScriptInterpreter(debugger, eScriptLanguageNone) {} diff --git 
a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index cc03627de901b..722af713ba437 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -56,7 +56,7 @@ using namespace lldb_private; using namespace lldb_private::python; using llvm::Expected; -LLDB_PLUGIN(ScriptInterpreterPython) +LLDB_PLUGIN_DEFINE(ScriptInterpreterPython) // Defined in the SWIG source file #if PY_MAJOR_VERSION >= 3 diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp index 8f1a2b57bc611..e61d9630656dd 100644 --- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp +++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp @@ -36,7 +36,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(StructuredDataDarwinLog) +LLDB_PLUGIN_DEFINE(StructuredDataDarwinLog) #pragma mark - #pragma mark Anonymous Namespace diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index d97ad7cbe451e..fcefb2e059b2b 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::breakpad; -LLDB_PLUGIN(SymbolFileBreakpad) +LLDB_PLUGIN_DEFINE(SymbolFileBreakpad) char SymbolFileBreakpad::ID; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index b45d84870ffbf..2ed050cc193f9 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ 
-94,7 +94,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SymbolFileDWARF) +LLDB_PLUGIN_DEFINE(SymbolFileDWARF) char SymbolFileDWARF::ID; diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 51459a99576de..75f2eb1594214 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -58,7 +58,7 @@ using namespace lldb; using namespace lldb_private; using namespace llvm::pdb; -LLDB_PLUGIN(SymbolFilePDB) +LLDB_PLUGIN_DEFINE(SymbolFilePDB) char SymbolFilePDB::ID; diff --git a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp index 42b843694a6f8..c4a0e609aa22e 100644 --- a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp +++ b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SymbolFileSymtab) +LLDB_PLUGIN_DEFINE(SymbolFileSymtab) char SymbolFileSymtab::ID; diff --git a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp index 21242a0c8b950..2e6fd43650212 100644 --- a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp +++ b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SymbolVendorELF) +LLDB_PLUGIN_DEFINE(SymbolVendorELF) // SymbolVendorELF constructor SymbolVendorELF::SymbolVendorELF(const lldb::ModuleSP &module_sp) diff --git a/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp b/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp index 71a1025bef259..2b67fee706178 100644 --- a/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp +++ b/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp @@ -26,7 +26,7 @@ using namespace lldb; using 
namespace lldb_private; -LLDB_PLUGIN(SymbolVendorMacOSX) +LLDB_PLUGIN_DEFINE(SymbolVendorMacOSX) // SymbolVendorMacOSX constructor SymbolVendorMacOSX::SymbolVendorMacOSX(const lldb::ModuleSP &module_sp) diff --git a/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp b/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp index 64dd956fd35fd..1c09dabc5622f 100644 --- a/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp +++ b/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp @@ -26,7 +26,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::wasm; -LLDB_PLUGIN(SymbolVendorWasm) +LLDB_PLUGIN_DEFINE(SymbolVendorWasm) // SymbolVendorWasm constructor SymbolVendorWasm::SymbolVendorWasm(const lldb::ModuleSP &module_sp) diff --git a/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp b/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp index 7ac70331267e0..25db5fe892fb6 100644 --- a/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp +++ b/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SystemRuntimeMacOSX) +LLDB_PLUGIN_DEFINE(SystemRuntimeMacOSX) // Create an instance of this class. 
This function is filled into the plugin // info class that gets handed out by the plugin factory and allows the lldb to diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index cbe0301fe162e..2fa5dc38eb8eb 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -81,7 +81,7 @@ using namespace lldb_private; using namespace clang; using llvm::StringSwitch; -LLDB_PLUGIN(TypeSystemClang) +LLDB_PLUGIN_DEFINE(TypeSystemClang) namespace { #ifdef LLDB_CONFIGURATION_DEBUG diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 01ef34204e5e9..ba7544fb52dd9 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -28,7 +28,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(UnwindAssemblyInstEmulation) +LLDB_PLUGIN_DEFINE(UnwindAssemblyInstEmulation) // UnwindAssemblyInstEmulation method definitions diff --git a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp index c49ca465b0a9f..430ba09b811c4 100644 --- a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp +++ b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp @@ -30,7 +30,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(UnwindAssembly_x86) +LLDB_PLUGIN_DEFINE(UnwindAssembly_x86) // UnwindAssemblyParser_x86 method definitions diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index cd48f93f80ca2..1c75c8ea35be8 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -793,6 +793,10 @@ 
if(NOT CYGWIN AND NOT WIN32) endif() add_flag_if_supported("-fdata-sections" FDATA_SECTIONS) endif() +elseif(MSVC) + if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) + append("/Gw" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() endif() if(MSVC) diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst index 5bf9f37384ab2..48d561349108c 100644 --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -46,7 +46,7 @@ This is an example workflow and configuration to get and build the LLVM source: * ``cd build`` * ``cmake -G [options] ../llvm`` - Some common generators are: + Some common build system generators are: * ``Ninja`` --- for generating `Ninja `_ build files. Most llvm developers use Ninja. @@ -75,9 +75,11 @@ This is an example workflow and configuration to get and build the LLVM source: * ``-DLLVM_ENABLE_ASSERTIONS=On`` --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types). - * Run your build tool of choice! + * ``cmake --build . [--target ]`` or the build system specified + above directly. - * The default target (i.e. ``ninja`` or ``make``) will build all of LLVM. + * The default target (i.e. ``cmake --build .`` or ``make``) will build all of + LLVM. * The ``check-all`` target (i.e. ``ninja check-all``) will run the regression tests to ensure everything is in working order. @@ -85,10 +87,10 @@ This is an example workflow and configuration to get and build the LLVM source: * CMake will generate build targets for each tool and library, and most LLVM sub-projects generate their own ``check-`` target. - * Running a serial build will be *slow*. To improve speed, try running a - parallel build. That's done by default in Ninja; for ``make``, use - ``make -j NNN`` (NNN is the number of parallel jobs, use e.g. number of - CPUs you have.) + * Running a serial build will be **slow**. To improve speed, try running a + parallel build. 
That's done by default in Ninja; for ``make``, use the + option ``-j NN``, where ``NN`` is the number of parallel jobs, e.g. the + number of available CPUs. * For more information see `CMake `__ diff --git a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp index 3828e0a5f82b3..0c986d5a3f1c5 100644 --- a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp +++ b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp @@ -131,7 +131,7 @@ class SpeculativeJIT { std::unique_ptr ES; DataLayout DL; MangleAndInterner Mangle{*ES, DL}; - ThreadPool CompileThreads{NumThreads}; + ThreadPool CompileThreads{llvm::hardware_concurrency(NumThreads)}; JITDylib &MainJD; diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp index 394c1308fd6fa..b920bee6ad14b 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp @@ -262,7 +262,8 @@ void ThinLtoJIT::setupLayers(JITTargetMachineBuilder JTMB, OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule); // Delegate compilation to the thread pool. 
- CompileThreads = std::make_unique(NumCompileThreads); + CompileThreads = std::make_unique( + llvm::hardware_concurrency(NumCompileThreads)); ES.setDispatchMaterialization( [this](JITDylib &JD, std::unique_ptr MU) { if (IsTrivialModule(MU.get())) { diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h index a6574be5c3973..29a24a0c5e147 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h +++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h @@ -26,7 +26,7 @@ class ThinLtoModuleIndex { public: ThinLtoModuleIndex(ExecutionSession &ES, unsigned ParseModuleThreads) : ES(ES), CombinedSummaryIndex(HaveGVs), - ParseModuleWorkers(ParseModuleThreads), + ParseModuleWorkers(llvm::hardware_concurrency(ParseModuleThreads)), NumParseModuleThreads(ParseModuleThreads) {} Error add(StringRef InputPath); diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 5284be8c4a027..02e01effc0fc9 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -14,6 +14,7 @@ #define LLVM_ADT_BITVECTOR_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/MathExtras.h" #include @@ -719,6 +720,14 @@ class BitVector { if (this == &RHS) return *this; Size = RHS.size(); + + // Handle tombstone when the BitVector is a key of a DenseHash. + if (RHS.isInvalid()) { + std::free(Bits.data()); + Bits = None; + return *this; + } + unsigned RHSWords = NumBitWords(Size); if (Size <= getBitCapacity()) { if (Size) @@ -758,6 +767,14 @@ class BitVector { std::swap(Size, RHS.Size); } + void invalid() { + assert(!Size && Bits.empty()); + Size = (unsigned)-1; + } + bool isInvalid() const { return Size == (unsigned)-1; } + + ArrayRef getData() const { return Bits; } + //===--------------------------------------------------------------------===// // Portable bit mask operations. 
//===--------------------------------------------------------------------===// @@ -932,6 +949,23 @@ inline size_t capacity_in_bytes(const BitVector &X) { return X.getMemorySize(); } +template <> struct DenseMapInfo { + static inline BitVector getEmptyKey() { return BitVector(); } + static inline BitVector getTombstoneKey() { + BitVector V; + V.invalid(); + return V; + } + static unsigned getHashValue(const BitVector &V) { + return DenseMapInfo>>::getHashValue( + std::make_pair(V.size(), V.getData())); + } + static bool isEqual(const BitVector &LHS, const BitVector &RHS) { + if (LHS.isInvalid() || RHS.isInvalid()) + return LHS.isInvalid() == RHS.isInvalid(); + return LHS == RHS; + } +}; } // end namespace llvm namespace std { diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h index 61375c0080220..b7367d68bdae8 100644 --- a/llvm/include/llvm/ADT/SmallBitVector.h +++ b/llvm/include/llvm/ADT/SmallBitVector.h @@ -662,6 +662,16 @@ class SmallBitVector { getPointer()->clearBitsNotInMask(Mask, MaskWords); } + void invalid() { + assert(empty()); + X = (uintptr_t)-1; + } + bool isInvalid() const { return X == (uintptr_t)-1; } + + ArrayRef getData() const { + return isSmall() ? 
makeArrayRef(X) : getPointer()->getData(); + } + private: template void applyMask(const uint32_t *Mask, unsigned MaskWords) { @@ -699,6 +709,23 @@ operator^(const SmallBitVector &LHS, const SmallBitVector &RHS) { return Result; } +template <> struct DenseMapInfo { + static inline SmallBitVector getEmptyKey() { return SmallBitVector(); } + static inline SmallBitVector getTombstoneKey() { + SmallBitVector V; + V.invalid(); + return V; + } + static unsigned getHashValue(const SmallBitVector &V) { + return DenseMapInfo>>::getHashValue( + std::make_pair(V.size(), V.getData())); + } + static bool isEqual(const SmallBitVector &LHS, const SmallBitVector &RHS) { + if (LHS.isInvalid() || RHS.isInvalid()) + return LHS.isInvalid() == RHS.isInvalid(); + return LHS == RHS; + } +}; } // end namespace llvm namespace std { diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0bdb49edc9830..d3f8896eca162 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1471,6 +1471,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { SingleCallCost = TargetTransformInfo::TCC_Expensive; break; // FIXME: ctlz, cttz, ... + case Intrinsic::bswap: + ISDs.push_back(ISD::BSWAP); + break; + case Intrinsic::bitreverse: + ISDs.push_back(ISD::BITREVERSE); + break; } const TargetLoweringBase *TLI = getTLI(); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index b4f9b96653c59..af8129b98a2b8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -280,9 +280,36 @@ class LegalizationArtifactCombiner { } } + /// Try to replace DstReg with SrcReg or build a COPY instruction + /// depending on the register constraints. 
+ static void replaceRegOrBuildCopy(Register DstReg, Register SrcReg, + MachineRegisterInfo &MRI, + MachineIRBuilder &Builder, + SmallVectorImpl &UpdatedDefs, + GISelObserverWrapper &Observer) { + if (!llvm::canReplaceReg(DstReg, SrcReg, MRI)) { + Builder.buildCopy(DstReg, SrcReg); + UpdatedDefs.push_back(DstReg); + return; + } + SmallVector UseMIs; + // Get the users and notify the observer before replacing. + for (auto &UseMI : MRI.use_instructions(DstReg)) { + UseMIs.push_back(&UseMI); + Observer.changingInstr(UseMI); + } + // Replace the registers. + MRI.replaceRegWith(DstReg, SrcReg); + UpdatedDefs.push_back(SrcReg); + // Notify the observer that we changed the instructions. + for (auto *UseMI : UseMIs) + Observer.changedInstr(*UseMI); + } + bool tryCombineMerges(MachineInstr &MI, SmallVectorImpl &DeadInsts, - SmallVectorImpl &UpdatedDefs) { + SmallVectorImpl &UpdatedDefs, + GISelObserverWrapper &Observer) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); unsigned NumDefs = MI.getNumOperands() - 1; @@ -395,10 +422,12 @@ class LegalizationArtifactCombiner { "Bitcast and the other kinds of conversions should " "have happened earlier"); + Builder.setInstr(MI); for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { - Register NewDef = MergeI->getOperand(Idx + 1).getReg(); - MRI.replaceRegWith(MI.getOperand(Idx).getReg(), NewDef); - UpdatedDefs.push_back(NewDef); + Register DstReg = MI.getOperand(Idx).getReg(); + Register SrcReg = MergeI->getOperand(Idx + 1).getReg(); + replaceRegOrBuildCopy(DstReg, SrcReg, MRI, Builder, UpdatedDefs, + Observer); } } @@ -498,7 +527,7 @@ class LegalizationArtifactCombiner { Changed = tryCombineSExt(MI, DeadInsts, UpdatedDefs); break; case TargetOpcode::G_UNMERGE_VALUES: - Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs); + Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs, WrapperObserver); break; case TargetOpcode::G_EXTRACT: Changed = tryCombineExtract(MI, DeadInsts, UpdatedDefs); diff --git 
a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 0ef9a713f784e..6f35718902518 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -266,6 +266,10 @@ class LegalizerHelper { LegalizeResult lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFPTOSI(MachineInstr &MI); + + LegalizeResult lowerFPTRUNC_F64_TO_F16(MachineInstr &MI); + LegalizeResult lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 63c5746bf183f..a88a97c666ad5 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -93,6 +93,11 @@ bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI); + +/// Check if DstReg can be replaced with SrcReg depending on the register +/// constraints. +bool canReplaceReg(Register DstReg, Register SrcReg, MachineRegisterInfo &MRI); + /// Check whether an instruction \p MI is dead: it only defines dead virtual /// registers, and doesn't have other side effects. 
bool isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h index 3b401b72a7d8a..960c557f55d40 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h @@ -49,6 +49,9 @@ enum class TraitProperty { /// Parse \p Str and return the trait set it matches or TraitSet::invalid. TraitSet getOpenMPContextTraitSetKind(StringRef Str); +/// Return the trait set for which \p Selector is a selector. +TraitSet getOpenMPContextTraitSetForSelector(TraitSelector Selector); + /// Return the trait set for which \p Property is a property. TraitSet getOpenMPContextTraitSetForProperty(TraitProperty Property); @@ -67,9 +70,7 @@ StringRef getOpenMPContextTraitSelectorName(TraitSelector Kind); /// Parse \p Str and return the trait set it matches or /// TraitProperty::invalid. -TraitProperty getOpenMPContextTraitPropertyKind(TraitSet Set, - TraitSelector Selector, - StringRef Str); +TraitProperty getOpenMPContextTraitPropertyKind(TraitSet Set, StringRef Str); /// Return the trait property for a singleton selector \p Selector. TraitProperty getOpenMPContextTraitPropertyForSelector(TraitSelector Selector); @@ -80,6 +81,16 @@ StringRef getOpenMPContextTraitPropertyName(TraitProperty Kind); /// Return a textual representation of the trait property \p Kind with selector /// and set name included. StringRef getOpenMPContextTraitPropertyFullName(TraitProperty Kind); + +/// Return a string listing all trait sets. +std::string listOpenMPContextTraitSets(); + +/// Return a string listing all trait selectors for \p Set. +std::string listOpenMPContextTraitSelectors(TraitSet Set); + +/// Return a string listing all trait properties for \p Set and \p Selector. +std::string listOpenMPContextTraitProperties(TraitSet Set, + TraitSelector Selector); ///} /// Return true if \p Selector can be nested in \p Set. 
Also sets diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index e864e05df3407..df28533456827 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -227,7 +227,8 @@ using ThinBackend = std::function( AddStreamFn AddStream, NativeObjectCache Cache)>; /// This ThinBackend runs the individual backend jobs in-process. -ThinBackend createInProcessThinBackend(unsigned ParallelismLevel); +/// The default value means to use one job per hardware core (not hyper-thread). +ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0); /// This ThinBackend writes individual module indexes to files, instead of /// running the individual backend jobs. This backend is for distributed builds diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h index 4bcbaa3142fd4..2036f46c6d561 100644 --- a/llvm/include/llvm/Support/ThreadPool.h +++ b/llvm/include/llvm/Support/ThreadPool.h @@ -13,7 +13,9 @@ #ifndef LLVM_SUPPORT_THREAD_POOL_H #define LLVM_SUPPORT_THREAD_POOL_H +#include "llvm/ADT/BitVector.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/thread.h" #include @@ -38,12 +40,11 @@ class ThreadPool { using TaskTy = std::function; using PackagedTaskTy = std::packaged_task; - /// Construct a pool with the number of threads found by - /// hardware_concurrency(). - ThreadPool(); - - /// Construct a pool of \p ThreadCount threads - ThreadPool(unsigned ThreadCount); + /// Construct a pool using the hardware strategy \p S for mapping hardware + /// execution resources (threads, cores, CPUs) + /// Defaults to using the maximum execution resources in the system, but + /// excluding any resources contained in the affinity mask. + ThreadPool(ThreadPoolStrategy S = hardware_concurrency()); /// Blocking destructor: the pool will wait for all the threads to complete. 
~ThreadPool(); @@ -68,6 +69,8 @@ class ThreadPool { /// It is an error to try to add new tasks while blocking on this call. void wait(); + unsigned getThreadCount() const { return ThreadCount; } + private: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. @@ -94,6 +97,8 @@ class ThreadPool { /// Signal for the destruction of the pool, asking thread to exit. bool EnableFlag; #endif + + unsigned ThreadCount; }; } diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index bacab8fa23b6d..d3d4a37e69c66 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -14,6 +14,7 @@ #ifndef LLVM_SUPPORT_THREADING_H #define LLVM_SUPPORT_THREADING_H +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX @@ -143,20 +144,52 @@ void llvm_execute_on_thread_async( #endif } - /// Get the amount of currency to use for tasks requiring significant - /// memory or other resources. Currently based on physical cores, if - /// available for the host system, otherwise falls back to - /// thread::hardware_concurrency(). - /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF - unsigned heavyweight_hardware_concurrency(); - - /// Get the number of threads that the current program can execute - /// concurrently. On some systems std::thread::hardware_concurrency() returns - /// the total number of cores, without taking affinity into consideration. - /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF. - /// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is - /// not available. 
- unsigned hardware_concurrency(); + /// This tells how a thread pool will be used + class ThreadPoolStrategy { + public: + // The default value (0) means all available threads should be used, + // excluding affinity mask. If set, this value only represents a suggested + // high bound, the runtime might choose a lower value (not higher). + unsigned ThreadsRequested = 0; + + // If SMT is active, use hyper threads. If false, there will be only one + // std::thread per core. + bool UseHyperThreads = true; + + /// Retrieves the max available threads for the current strategy. This + /// accounts for affinity masks and takes advantage of all CPU sockets. + unsigned compute_thread_count() const; + + /// Assign the current thread to an ideal hardware CPU or NUMA node. In a + /// multi-socket system, this ensures threads are assigned to all CPU + /// sockets. \p ThreadPoolNum represents a number bounded by [0, + /// compute_thread_count()). + void apply_thread_strategy(unsigned ThreadPoolNum) const; + }; + + /// Returns a thread strategy for tasks requiring significant memory or other + /// resources. To be used for workloads where hardware_concurrency() proves to + /// be less efficient. Avoid this strategy if doing lots of I/O. Currently + /// based on physical cores, if available for the host system, otherwise falls + /// back to hardware_concurrency(). Returns 1 when LLVM is configured with + /// LLVM_ENABLE_THREADS = OFF + inline ThreadPoolStrategy + heavyweight_hardware_concurrency(unsigned ThreadCount = 0) { + ThreadPoolStrategy S; + S.UseHyperThreads = false; + S.ThreadsRequested = ThreadCount; + return S; + } + + /// Returns a default thread strategy where all available hardware resources + /// are to be used, except for those initially excluded by an affinity mask. + /// This function takes affinity into consideration. Returns 1 when LLVM is + /// configured with LLVM_ENABLE_THREADS=OFF.
+ inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) { + ThreadPoolStrategy S; + S.ThreadsRequested = ThreadCount; + return S; + } /// Return the current thread id, as used in various OS system calls. /// Note that not all platforms guarantee that the value returned will be @@ -184,6 +217,14 @@ void llvm_execute_on_thread_async( /// the operation succeeded or failed is returned. void get_thread_name(SmallVectorImpl &Name); + /// Returns a mask that represents on which hardware thread, core, CPU, NUMA + /// group, the calling thread can be executed. On Windows, threads cannot + /// cross CPU boundaries. + llvm::BitVector get_thread_affinity_mask(); + + /// Returns how many physical CPUs or NUMA groups the system has. + unsigned get_cpus(); + enum class ThreadPriority { Background = 0, Default = 1, diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index ce9944a5ce4be..450595cac57b4 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -902,11 +902,6 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock( Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad, BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) { - bool isInvariantLoad = false; - - if (LoadInst *LI = dyn_cast_or_null(QueryInst)) - isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load); - // Do a binary search to see if we already have an entry for this block in // the cache set. If so, find it. NonLocalDepInfo::iterator Entry = std::upper_bound( @@ -918,13 +913,6 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock( if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB) ExistingResult = &*Entry; - // Use cached result for invariant load only if there is no dependency for non - // invariant load. In this case invariant load can not have any dependency as - // well. 
- if (ExistingResult && isInvariantLoad && - !ExistingResult->getResult().isNonFuncLocal()) - ExistingResult = nullptr; - // If we have a cached entry, and it is non-dirty, use it as the value for // this dependency. if (ExistingResult && !ExistingResult->getResult().isDirty()) { @@ -953,10 +941,6 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock( MemDepResult Dep = getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst); - // Don't cache results for invariant load. - if (isInvariantLoad) - return Dep; - // If we had a dirty entry for the block, update it. Otherwise, just add // a new entry. if (ExistingResult) @@ -1045,10 +1029,6 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( InitialNLPI.Size = Loc.Size; InitialNLPI.AATags = Loc.AATags; - bool isInvariantLoad = false; - if (LoadInst *LI = dyn_cast_or_null(QueryInst)) - isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load); - // Get the NLPI for CacheKey, inserting one into the map if it doesn't // already have one. std::pair Pair = @@ -1057,8 +1037,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // If we already have a cache entry for this CacheKey, we may need to do some // work to reconcile the cache entry and the current query. - // Invariant loads don't participate in caching. Thus no need to reconcile. - if (!isInvariantLoad && !Pair.second) { + if (!Pair.second) { if (CacheInfo->Size != Loc.Size) { bool ThrowOutEverything; if (CacheInfo->Size.hasValue() && Loc.Size.hasValue()) { @@ -1114,10 +1093,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // If we have valid cached information for exactly the block we are // investigating, just return it with no recomputation. - // Don't use cached information for invariant loads since it is valid for - // non-invariant loads only. 
- if (!isInvariantLoad && - CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) { + if (CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) { // We have a fully cached result for this query then we can just return the // cached results and populate the visited set. However, we have to verify // that we don't already have conflicting results for these blocks. Check @@ -1153,18 +1129,14 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( return true; } - // Invariant loads don't affect cache in any way thus no need to update - // CacheInfo as well. - if (!isInvariantLoad) { - // Otherwise, either this is a new block, a block with an invalid cache - // pointer or one that we're about to invalidate by putting more info into - // it than its valid cache info. If empty, the result will be valid cache - // info, otherwise it isn't. - if (Cache->empty()) - CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock); - else - CacheInfo->Pair = BBSkipFirstBlockPair(); - } + // Otherwise, either this is a new block, a block with an invalid cache + // pointer or one that we're about to invalidate by putting more info into it + // than its valid cache info. If empty, the result will be valid cache info, + // otherwise it isn't. + if (Cache->empty()) + CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock); + else + CacheInfo->Pair = BBSkipFirstBlockPair(); SmallVector Worklist; Worklist.push_back(StartBB); @@ -1405,26 +1377,22 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( if (SkipFirstBlock) return false; - // Results of invariant loads are not cached thus no need to update cached - // information. 
- if (!isInvariantLoad) { - bool foundBlock = false; - for (NonLocalDepEntry &I : llvm::reverse(*Cache)) { - if (I.getBB() != BB) - continue; + bool foundBlock = false; + for (NonLocalDepEntry &I : llvm::reverse(*Cache)) { + if (I.getBB() != BB) + continue; - assert((GotWorklistLimit || I.getResult().isNonLocal() || - !DT.isReachableFromEntry(BB)) && - "Should only be here with transparent block"); - foundBlock = true; - I.setResult(MemDepResult::getUnknown()); - Result.push_back( - NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr())); - break; - } - (void)foundBlock; (void)GotWorklistLimit; - assert((foundBlock || GotWorklistLimit) && "Current block not in cache?"); + assert((GotWorklistLimit || I.getResult().isNonLocal() || + !DT.isReachableFromEntry(BB)) && + "Should only be here with transparent block"); + foundBlock = true; + I.setResult(MemDepResult::getUnknown()); + Result.push_back( + NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr())); + break; } + (void)foundBlock; (void)GotWorklistLimit; + assert((foundBlock || GotWorklistLimit) && "Current block not in cache?"); } // Okay, we're done now. If we added new values to the cache, re-sort it. diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 38fbac264430f..02aa2b36783d5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1705,13 +1705,16 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { const Function &F = MF.getFunction(); // Get the function symbol. - if (TM.getTargetTriple().isOSAIX()) { + if (!MAI->needsFunctionDescriptors()) { + CurrentFnSym = getSymbol(&MF.getFunction()); + } else { + assert(TM.getTargetTriple().isOSAIX() && + "Only AIX uses the function descriptor hooks."); // AIX is unique here in that the name of the symbol emitted for the // function body does not have the same name as the source function's // C-linkage name. 
- assert(MAI->needsFunctionDescriptors() && "AIX ABI is descriptor based."); assert(CurrentFnDescSym && "The function descriptor symbol needs to be" - " initalized first."); + " initalized first."); // Get the function entry point symbol. CurrentFnSym = @@ -1721,8 +1724,6 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { MCSectionXCOFF *FnEntryPointSec = cast(getObjFileLowering().SectionForGlobal(&F, TM)); cast(CurrentFnSym)->setContainingCsect(FnEntryPointSec); - } else { - CurrentFnSym = getSymbol(&MF.getFunction()); } CurrentFnSymForSize = CurrentFnSym; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index cc8f5a10ca07f..79fbe1db9d3fd 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -75,36 +75,7 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { return false; Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - - // Give up if either DstReg or SrcReg is a physical register. - if (Register::isPhysicalRegister(DstReg) || - Register::isPhysicalRegister(SrcReg)) - return false; - - // Give up the types don't match. - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); - // Give up if one has a valid LLT, but the other doesn't. - if (DstTy.isValid() != SrcTy.isValid()) - return false; - // Give up if the types don't match. - if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy) - return false; - - // Get the register banks and classes. - const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg); - const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); - const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg); - const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg); - - // Replace if the register constraints match. - if ((SrcRC == DstRC) && (SrcBank == DstBank)) - return true; - // Replace if DstReg has no constraints. 
- if (!DstBank && !DstRC) - return true; - - return false; + return canReplaceReg(DstReg, SrcReg, MRI); } void CombinerHelper::applyCombineCopy(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5396fcfc4824f..3af0705dff854 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2487,6 +2487,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerFPTOUI(MI, TypeIdx, Ty); case G_FPTOSI: return lowerFPTOSI(MI); + case G_FPTRUNC: + return lowerFPTRUNC(MI, TypeIdx, Ty); case G_SMIN: case G_SMAX: case G_UMIN: @@ -4476,6 +4478,128 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) { return Legalized; } +// f64 -> f16 conversion using round-to-nearest-even rounding mode. +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. + return UnableToLegalize; + + const unsigned ExpMask = 0x7ff; + const unsigned ExpBiasf64 = 1023; + const unsigned ExpBiasf16 = 15; + const LLT S32 = LLT::scalar(32); + const LLT S1 = LLT::scalar(1); + + auto Unmerge = MIRBuilder.buildUnmerge(S32, Src); + Register U = Unmerge.getReg(0); + Register UH = Unmerge.getReg(1); + + auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20)); + + // Subtract the fp64 exponent bias (1023) to get the real exponent and + // add the f16 bias (15) to get the biased exponent for the f16 format. 
+ E = MIRBuilder.buildAdd( + S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16)); + E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask)); + + auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8)); + M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe)); + + auto MaskedSig = MIRBuilder.buildAnd(S32, UH, + MIRBuilder.buildConstant(S32, 0x1ff)); + MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U); + + auto Zero = MIRBuilder.buildConstant(S32, 0); + auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero); + auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0); + M = MIRBuilder.buildOr(S32, M, Lo40Set); + + // (M != 0 ? 0x0200 : 0) | 0x7c00; + auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200); + auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero); + auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero); + + auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00); + auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00); + + // N = M | (E << 12); + auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12)); + auto N = MIRBuilder.buildOr(S32, M, EShl12); + + // B = clamp(1-E, 0, 13); + auto One = MIRBuilder.buildConstant(S32, 1); + auto OneSubExp = MIRBuilder.buildSub(S32, One, E); + auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero); + B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13)); + + auto SigSetHigh = MIRBuilder.buildOr(S32, M, + MIRBuilder.buildConstant(S32, 0x1000)); + + auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B); + auto D0 = MIRBuilder.buildShl(S32, D, B); + + auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, + D0, SigSetHigh); + auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh); + D = MIRBuilder.buildOr(S32, D, D1); + + auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One); + auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N); + + auto VLow3 = 
MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7)); + V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2)); + + auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3, + MIRBuilder.buildConstant(S32, 3)); + auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3); + + auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3, + MIRBuilder.buildConstant(S32, 5)); + auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5); + + V1 = MIRBuilder.buildOr(S32, V0, V1); + V = MIRBuilder.buildAdd(S32, V, V1); + + auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, + E, MIRBuilder.buildConstant(S32, 30)); + V = MIRBuilder.buildSelect(S32, CmpEGt30, + MIRBuilder.buildConstant(S32, 0x7c00), V); + + auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, + E, MIRBuilder.buildConstant(S32, 1039)); + V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V); + + // Extract the sign bit. + auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16)); + Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000)); + + // Insert the sign bit + V = MIRBuilder.buildOr(S32, Sign, V); + + MIRBuilder.buildTrunc(Dst, V); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + const LLT S64 = LLT::scalar(64); + const LLT S16 = LLT::scalar(16); + + if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64) + return lowerFPTRUNC_F64_TO_F16(MI); + + return UnableToLegalize; +} + static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { case TargetOpcode::G_SMIN: diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 5f72974b31ec3..d29e9546be0bf 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ 
b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -155,6 +155,20 @@ bool llvm::constrainSelectedInstRegOperands(MachineInstr &I, return true; } +bool llvm::canReplaceReg(Register DstReg, Register SrcReg, + MachineRegisterInfo &MRI) { + // Give up if either DstReg or SrcReg is a physical register. + if (DstReg.isPhysical() || SrcReg.isPhysical()) + return false; + // Give up if the types don't match. + if (MRI.getType(DstReg) != MRI.getType(SrcReg)) + return false; + // Replace if either DstReg has no constraints or the register + // constraints match. + return !MRI.getRegClassOrRegBank(DstReg) || + MRI.getRegClassOrRegBank(DstReg) == MRI.getRegClassOrRegBank(SrcReg); +} + bool llvm::isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI) { // If we can move an instruction, we can remove it. Otherwise, it has diff --git a/llvm/lib/CodeGen/ParallelCG.cpp b/llvm/lib/CodeGen/ParallelCG.cpp index 7dbd830666fb8..c19ed1f8f71da 100644 --- a/llvm/lib/CodeGen/ParallelCG.cpp +++ b/llvm/lib/CodeGen/ParallelCG.cpp @@ -51,7 +51,7 @@ std::unique_ptr llvm::splitCodeGen( // Create ThreadPool in nested scope so that threads will be joined // on destruction. 
{ - ThreadPool CodegenThreadPool(OSs.size()); + ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size())); int ThreadCount = 0; SplitModule( diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index 03919c805130c..715ad24b55214 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -2446,7 +2446,7 @@ bool DWARFLinker::link() { } EmitLambda(); } else { - ThreadPool Pool(2); + ThreadPool Pool(hardware_concurrency(2)); Pool.async(AnalyzeAll); Pool.async(CloneAll); Pool.wait(); diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index e01b6b6ebc0cc..c3bf71f21cda2 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -445,7 +445,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads) { // Now parse all DIEs in case we have cross compile unit references in a // thread pool. - ThreadPool pool(NumThreads); + ThreadPool pool(hardware_concurrency(NumThreads)); for (const auto &CU : DICtx.compile_units()) pool.async([&CU]() { CU->getUnitDIE(false /*CUDieOnly*/); }); pool.wait(); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index f81e584b3b2dc..4218ca4e481f7 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -157,7 +157,8 @@ LLJIT::LLJIT(LLJITBuilderState &S, Error &Err) if (S.NumCompileThreads > 0) { TransformLayer->setCloneToNewContextOnEmit(true); - CompileThreads = std::make_unique(S.NumCompileThreads); + CompileThreads = + std::make_unique(hardware_concurrency(S.NumCompileThreads)); ES->setDispatchMaterialization( [this](JITDylib &JD, std::unique_ptr MU) { // FIXME: Switch to move capture once we have c++14. 
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index 7bdc16af9014c..f4c4bdfad0b64 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -286,6 +286,16 @@ TraitSet llvm::omp::getOpenMPContextTraitSetKind(StringRef S) { #include "llvm/Frontend/OpenMP/OMPKinds.def" .Default(TraitSet::invalid); } + +TraitSet +llvm::omp::getOpenMPContextTraitSetForSelector(TraitSelector Selector) { + switch (Selector) { +#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \ + case TraitSelector::Enum: \ + return TraitSet::TraitSetEnum; +#include "llvm/Frontend/OpenMP/OMPKinds.def" + } +} TraitSet llvm::omp::getOpenMPContextTraitSetForProperty(TraitProperty Property) { switch (Property) { @@ -333,11 +343,10 @@ StringRef llvm::omp::getOpenMPContextTraitSelectorName(TraitSelector Kind) { llvm_unreachable("Unknown trait selector!"); } -TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind( - TraitSet Set, TraitSelector Selector, StringRef S) { +TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind(TraitSet Set, + StringRef S) { #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ - if (Set == TraitSet::TraitSetEnum && \ - Selector == TraitSelector::TraitSelectorEnum && Str == S) \ + if (Set == TraitSet::TraitSetEnum && Str == S) \ return TraitProperty::Enum; #include "llvm/Frontend/OpenMP/OMPKinds.def" return TraitProperty::invalid; @@ -398,3 +407,36 @@ bool llvm::omp::isValidTraitPropertyForTraitSetAndSelector( } llvm_unreachable("Unknown trait property!"); } + +std::string llvm::omp::listOpenMPContextTraitSets() { + std::string S; +#define OMP_TRAIT_SET(Enum, Str) \ + if (Str != "invalid") \ + S.append("'").append(Str).append("'").append(" "); +#include "llvm/Frontend/OpenMP/OMPKinds.def" + S.pop_back(); + return S; +} + +std::string llvm::omp::listOpenMPContextTraitSelectors(TraitSet Set) { + std::string S; +#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, 
Str, ReqProp) \ + if (TraitSet::TraitSetEnum == Set && Str != "Invalid") \ + S.append("'").append(Str).append("'").append(" "); +#include "llvm/Frontend/OpenMP/OMPKinds.def" + S.pop_back(); + return S; +} + +std::string +llvm::omp::listOpenMPContextTraitProperties(TraitSet Set, + TraitSelector Selector) { + std::string S; +#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ + if (TraitSet::TraitSetEnum == Set && \ + TraitSelector::TraitSelectorEnum == Selector && Str != "invalid") \ + S.append("'").append(Str).append("'").append(" "); +#include "llvm/Frontend/OpenMP/OMPKinds.def" + S.pop_back(); + return S; +} diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 0a43e17c358c8..d92943d6975b8 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -31,10 +31,8 @@ static cl::opt PropagateAttrs("propagate-attrs", cl::init(true), cl::Hidden, cl::desc("Propagate attributes in index")); -// FIXME: Enable again when thin link compile time regressions understood and -// addressed static cl::opt ImportConstantsWithRefs( - "import-constants-with-refs", cl::init(false), cl::Hidden, + "import-constants-with-refs", cl::init(true), cl::Hidden, cl::desc("Import constant global variables with references")); FunctionSummary FunctionSummary::ExternalNode = diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 1d23c6bab36d5..f8affcb20ceff 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -477,8 +477,7 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel, LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { if (!Backend) - this->Backend = - createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); + this->Backend = createInProcessThinBackend(); } LTO::LTO(Config Conf, ThinBackend Backend, @@ -1095,7 +1094,8 @@ class InProcessThinBackend : public ThinBackendProc { const 
StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), - BackendThreadPool(ThinLTOParallelismLevel), + BackendThreadPool( + heavyweight_hardware_concurrency(ThinLTOParallelismLevel)), AddStream(std::move(AddStream)), Cache(std::move(Cache)) { for (auto &Name : CombinedIndex.cfiFunctionDefs()) CfiFunctionDefs.insert( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index b85471555b092..ec57744cf4803 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -375,7 +375,8 @@ void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream, void splitCodeGen(const Config &C, TargetMachine *TM, AddStreamFn AddStream, unsigned ParallelCodeGenParallelismLevel, std::unique_ptr Mod) { - ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel); + ThreadPool CodegenThreadPool( + heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel)); unsigned ThreadCount = 0; const Target *T = &TM->getTarget(); diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index a4f270240005c..152f0afcf12ea 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -80,8 +80,8 @@ extern cl::opt RemarksFormat; namespace { -static cl::opt - ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency())); +// Default to using one job per hardware core in the system +static cl::opt ThreadCount("threads", cl::init(0)); // Simple helper to save temporary files for debug. 
static void saveTempBitcode(const Module &TheModule, StringRef TempDir, @@ -1042,7 +1042,7 @@ void ThinLTOCodeGenerator::run() { // Parallel optimizer + codegen { - ThreadPool Pool(ThreadCount); + ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount)); for (auto IndexCount : ModulesOrdering) { auto &Mod = Modules[IndexCount]; Pool.async([&](int count) { diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index ef38c1c09413a..7e772b2b1378a 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1266,7 +1266,7 @@ StringRef sys::getHostCPUName() { return "generic"; } // On Linux, the number of physical cores can be computed from /proc/cpuinfo, // using the number of unique physical/core id pairs. The following // implementation reads the /proc/cpuinfo format on an x86_64 system. -static int computeHostNumPhysicalCores() { +int computeHostNumPhysicalCores() { // Read /proc/cpuinfo as a stream (until EOF reached). It cannot be // mmapped because it appears to have 0 size. llvm::ErrorOr> Text = @@ -1312,7 +1312,7 @@ static int computeHostNumPhysicalCores() { #include // Gets the number of *physical cores* on the machine. -static int computeHostNumPhysicalCores() { +int computeHostNumPhysicalCores() { uint32_t count; size_t len = sizeof(count); sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0); @@ -1326,6 +1326,9 @@ static int computeHostNumPhysicalCores() { } return count; } +#elif defined(_WIN32) +// Defined in llvm/lib/Support/Windows/Threading.inc +int computeHostNumPhysicalCores(); #else // On other systems, return -1 to indicate unknown. static int computeHostNumPhysicalCores() { return -1; } diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp index 523665d14b029..0272a53beb393 100644 --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -39,20 +39,21 @@ class Executor { /// in filo order. 
class ThreadPoolExecutor : public Executor { public: - explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency()) { + explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency()) { + unsigned ThreadCount = S.compute_thread_count(); // Spawn all but one of the threads in another thread as spawning threads // can take a while. Threads.reserve(ThreadCount); Threads.resize(1); std::lock_guard Lock(Mutex); - Threads[0] = std::thread([&, ThreadCount] { - for (unsigned i = 1; i < ThreadCount; ++i) { - Threads.emplace_back([=] { work(); }); + Threads[0] = std::thread([this, ThreadCount, S] { + for (unsigned I = 1; I < ThreadCount; ++I) { + Threads.emplace_back([=] { work(S, I); }); if (Stop) break; } ThreadsCreated.set_value(); - work(); + work(S, 0); }); } @@ -90,7 +91,8 @@ class ThreadPoolExecutor : public Executor { } private: - void work() { + void work(ThreadPoolStrategy S, unsigned ThreadID) { + S.apply_thread_strategy(ThreadID); while (true) { std::unique_lock Lock(Mutex); Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); }); diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index 40982d777914d..5aa5815d7272c 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -20,16 +20,15 @@ using namespace llvm; #if LLVM_ENABLE_THREADS -// Default to hardware_concurrency -ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {} - -ThreadPool::ThreadPool(unsigned ThreadCount) - : ActiveThreads(0), EnableFlag(true) { +ThreadPool::ThreadPool(ThreadPoolStrategy S) + : ActiveThreads(0), EnableFlag(true), + ThreadCount(S.compute_thread_count()) { // Create ThreadCount threads that will loop forever, wait on QueueCondition // for tasks to be queued or the Pool to be destroyed. 
Threads.reserve(ThreadCount); for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) { - Threads.emplace_back([&] { + Threads.emplace_back([S, ThreadID, this] { + S.apply_thread_strategy(ThreadID); while (true) { PackagedTaskTy Task; { @@ -108,12 +107,10 @@ ThreadPool::~ThreadPool() { #else // LLVM_ENABLE_THREADS Disabled -ThreadPool::ThreadPool() : ThreadPool(0) {} - // No threads are launched, issue a warning if ThreadCount is not 0 -ThreadPool::ThreadPool(unsigned ThreadCount) - : ActiveThreads(0) { - if (ThreadCount) { +ThreadPool::ThreadPool(ThreadPoolStrategy S) + : ActiveThreads(0), ThreadCount(S.compute_thread_count()) { + if (ThreadCount != 1) { errs() << "Warning: request a ThreadPool with " << ThreadCount << " threads, but LLVM_ENABLE_THREADS has been turned off\n"; } @@ -138,8 +135,6 @@ std::shared_future ThreadPool::asyncImpl(TaskTy Task) { return Future; } -ThreadPool::~ThreadPool() { - wait(); -} +ThreadPool::~ThreadPool() { wait(); } #endif diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp index 48750cef5ec22..de5adaddd9d38 100644 --- a/llvm/lib/Support/Threading.cpp +++ b/llvm/lib/Support/Threading.cpp @@ -45,10 +45,6 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData, Fn(UserData); } -unsigned llvm::heavyweight_hardware_concurrency() { return 1; } - -unsigned llvm::hardware_concurrency() { return 1; } - uint64_t llvm::get_threadid() { return 0; } uint32_t llvm::get_max_thread_name_length() { return 0; } @@ -57,6 +53,13 @@ void llvm::set_thread_name(const Twine &Name) {} void llvm::get_thread_name(SmallVectorImpl &Name) { Name.clear(); } +llvm::BitVector llvm::get_thread_affinity_mask() { return {}; } + +unsigned llvm::ThreadPoolStrategy::compute_thread_count() const { + // When threads are disabled, ensure clients will loop at least once. 
+ return 1; +} + #if LLVM_ENABLE_THREADS == 0 void llvm::llvm_execute_on_thread_async( llvm::unique_function Func, @@ -78,30 +81,19 @@ void llvm::llvm_execute_on_thread_async( #else -#include -unsigned llvm::heavyweight_hardware_concurrency() { - // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use - // `std::thread` directly instead of `llvm::thread` (and indeed, doing so - // allows us to not define `thread` in the llvm namespace, which conflicts - // with some platforms such as FreeBSD whose headers also define a struct - // called `thread` in the global namespace which can cause ambiguity due to - // ADL. - int NumPhysical = sys::getHostNumPhysicalCores(); - if (NumPhysical == -1) - return std::thread::hardware_concurrency(); - return NumPhysical; -} +int computeHostNumHardwareThreads(); -unsigned llvm::hardware_concurrency() { -#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT) - cpu_set_t Set; - if (sched_getaffinity(0, sizeof(Set), &Set)) - return CPU_COUNT(&Set); -#endif - // Guard against std::thread::hardware_concurrency() returning 0. - if (unsigned Val = std::thread::hardware_concurrency()) - return Val; - return 1; +unsigned llvm::ThreadPoolStrategy::compute_thread_count() const { + int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads() + : sys::getHostNumPhysicalCores(); + if (MaxThreadCount <= 0) + MaxThreadCount = 1; + + // No need to create more threads than there are hardware threads, it would + // uselessly induce more context-switching and cache eviction. 
+ if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount) + return MaxThreadCount; + return ThreadsRequested; } namespace { diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc index afb887fc10960..8cacaa83e961a 100644 --- a/llvm/lib/Support/Unix/Threading.inc +++ b/llvm/lib/Support/Unix/Threading.inc @@ -267,3 +267,27 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { #endif return SetThreadPriorityResult::FAILURE; } + +#include + +int computeHostNumHardwareThreads() { +#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT) + cpu_set_t Set; + if (sched_getaffinity(0, sizeof(Set), &Set)) + return CPU_COUNT(&Set); +#endif + // Guard against std::thread::hardware_concurrency() returning 0. + if (unsigned Val = std::thread::hardware_concurrency()) + return Val; + return 1; +} + +void llvm::ThreadPoolStrategy::apply_thread_strategy( + unsigned ThreadPoolNum) const {} + +llvm::BitVector llvm::get_thread_affinity_mask() { + // FIXME: Implement + llvm_unreachable("Not implemented!"); +} + +unsigned llvm::get_cpus() { return 1; } diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc index 9456efa686ffc..eb92296212263 100644 --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -16,6 +16,8 @@ #include "WindowsSupport.h" #include +#include + // Windows will at times define MemoryFence. #ifdef MemoryFence #undef MemoryFence @@ -122,3 +124,163 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { ? 
SetThreadPriorityResult::SUCCESS : SetThreadPriorityResult::FAILURE; } + +struct ProcessorGroup { + unsigned ID; + unsigned AllThreads; + unsigned UsableThreads; + unsigned ThreadsPerCore; + uint64_t Affinity; +}; + +template +static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) { + DWORD Len = 0; + BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len); + if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return false; + } + auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len); + R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len); + if (R) { + auto *End = + (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len); + for (auto *Curr = Info; Curr < End; + Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr + + Curr->Size)) { + if (Curr->Relationship != Relationship) + continue; + Fn(Curr); + } + } + free(Info); + return true; +} + +static ArrayRef getProcessorGroups() { + auto computeGroups = []() { + SmallVector Groups; + + auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) { + GROUP_RELATIONSHIP &El = ProcInfo->Group; + for (unsigned J = 0; J < El.ActiveGroupCount; ++J) { + ProcessorGroup G; + G.ID = Groups.size(); + G.AllThreads = El.GroupInfo[J].MaximumProcessorCount; + G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount; + assert(G.UsableThreads <= 64); + G.Affinity = El.GroupInfo[J].ActiveProcessorMask; + Groups.push_back(G); + } + }; + + if (!IterateProcInfo(RelationGroup, HandleGroup)) + return std::vector(); + + auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) { + PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor; + assert(El.GroupCount == 1); + unsigned NumHyperThreads = 1; + // If the flag is set, each core supports more than one hyper-thread. 
+ if (El.Flags & LTP_PC_SMT) + NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count(); + unsigned I = El.GroupMask[0].Group; + Groups[I].ThreadsPerCore = NumHyperThreads; + }; + + if (!IterateProcInfo(RelationProcessorCore, HandleProc)) + return std::vector(); + + // If there's an affinity mask set on one of the CPUs, then assume the user + // wants to constrain the current process to only a single CPU. + for (auto &G : Groups) { + if (G.UsableThreads != G.AllThreads) { + ProcessorGroup NewG{G}; + Groups.clear(); + Groups.push_back(NewG); + break; + } + } + + return std::vector(Groups.begin(), Groups.end()); + }; + static auto Groups = computeGroups(); + return ArrayRef(Groups); +} + +template +static unsigned aggregate(R &&Range, UnaryPredicate P) { + unsigned I{}; + for (const auto &It : Range) + I += P(It); + return I; +} + +// for sys::getHostNumPhysicalCores +int computeHostNumPhysicalCores() { + static unsigned Cores = + aggregate(getProcessorGroups(), [](const ProcessorGroup &G) { + return G.UsableThreads / G.ThreadsPerCore; + }); + return Cores; +} + +int computeHostNumHardwareThreads() { + static unsigned Threads = + aggregate(getProcessorGroups(), + [](const ProcessorGroup &G) { return G.UsableThreads; }); + return Threads; +} + +// Assign the current thread to a more appropriate CPU socket or CPU group +void llvm::ThreadPoolStrategy::apply_thread_strategy( + unsigned ThreadPoolNum) const { + ArrayRef Groups = getProcessorGroups(); + + assert(ThreadPoolNum < compute_thread_count() && + "The thread index is not within thread strategy's range!"); + + // In this mode, the ThreadNumber represents the core number, not the + // hyper-thread number. Assumes all NUMA groups have the same amount of + // hyper-threads. 
+ if (!UseHyperThreads) + ThreadPoolNum *= Groups[0].ThreadsPerCore; + + unsigned ThreadRangeStart = 0; + for (unsigned I = 0; I < Groups.size(); ++I) { + const ProcessorGroup &G = Groups[I]; + if (ThreadPoolNum >= ThreadRangeStart && + ThreadPoolNum < ThreadRangeStart + G.UsableThreads) { + + GROUP_AFFINITY Affinity{}; + Affinity.Group = G.ID; + Affinity.Mask = G.Affinity; + SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr); + } + ThreadRangeStart += G.UsableThreads; + } +} + +llvm::BitVector llvm::get_thread_affinity_mask() { + GROUP_AFFINITY Affinity{}; + GetThreadGroupAffinity(GetCurrentThread(), &Affinity); + + static unsigned All = + aggregate(getProcessorGroups(), + [](const ProcessorGroup &G) { return G.AllThreads; }); + + unsigned StartOffset = + aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) { + return G.ID < Affinity.Group ? G.AllThreads : 0; + }); + + llvm::BitVector V; + V.resize(All); + for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) { + if ((Affinity.Mask >> I) & 1) + V.set(StartOffset + I); + } + return V; +} + +unsigned llvm::get_cpus() { return getProcessorGroups().size(); } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 68b58b061765b..010cfe544b702 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -429,6 +429,57 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; + case AArch64::BSPv8i8: + case AArch64::BSPv16i8: { + Register DstReg = MI.getOperand(0).getReg(); + if (DstReg == MI.getOperand(3).getReg()) { + // Expand to BIT + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::BITv8i8 + : AArch64::BITv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)); + } else if (DstReg == MI.getOperand(2).getReg()) { + // Expand to BIF + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 + : AArch64::BIFv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)); + } else { + // Expand to BSL, use additional move if required + if (DstReg == MI.getOperand(1).getReg()) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } else { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8 + : AArch64::ORRv16i8)) + .addReg(DstReg) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .addReg(DstReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } + } + MI.eraseFromParent(); + return true; + } + case AArch64::ADDWrr: case AArch64::SUBWrr: case AArch64::ADDXrr: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9736a18832c06..a64baa9f5b4d0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1287,7 +1287,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; case AArch64ISD::BICi: return "AArch64ISD::BICi"; case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; - case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::BSP: return "AArch64ISD::BSP"; case AArch64ISD::NEG: return "AArch64ISD::NEG"; case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; @@ -10229,7 +10229,7 @@ static SDValue tryCombineToBSL(SDNode *N, } if (FoundMatch) - return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f664484a88038..52728d5abd557 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -90,9 +90,9 @@ enum NodeType : unsigned { BICi, ORRi, - // Vector bit select: similar to ISD::VSELECT but not all bits within an + // Vector bitwise select: similar to ISD::VSELECT but not all bits within an // element must be identical. 
- BSL, + BSP, // Vector arithmetic negation NEG, @@ -166,7 +166,7 @@ enum NodeType : unsigned { // Vector bitwise negation NOT, - // Vector bitwise selection + // Vector bitwise insertion BIT, // Compare-and-branch diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index b2fa8a55c252d..43ceec94c98b9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -5207,6 +5207,47 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorPseudo pattern> + : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>, + Sched<[WriteV]>; + +multiclass SIMDLogicalThreeVectorPseudo { + def v8i8 : BaseSIMDThreeSameVectorPseudo; + def v16i8 : BaseSIMDThreeSameVectorPseudo; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + // All operand sizes distinguished in the encoding. 
multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { @@ -5427,7 +5468,7 @@ multiclass SIMDLogicalThreeVector size, string asm, } multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { + string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 67c7039e46795..de92bd37b5042 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -468,7 +468,7 @@ def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; -def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; +def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; @@ -3955,33 +3955,36 @@ defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, 
V64:$Rm)>; -def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +// Pseudo bitwise select pattern BSP. +// It is expanded into BSL/BIT/BIF after register allocation. +defm BSP : SIMDLogicalThreeVectorPseudo>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">; + +def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 
9f566d1c7079b..19ff13524fa8d 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -501,7 +501,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // Q form - v16i8, v8i16, v4i32, v2i64 // ASIMD bitwise insert, Q-form -def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>; +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>; // ASIMD duplicate, gen reg, D-form and Q-form def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 798ecb7508c08..a79155dc06fb9 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -494,7 +494,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; // WriteV includes: // SHLL,SSHLL,USHLL // SLI,SRI -// BIF,BIT,BSL +// BIF,BIT,BSL,BSP // EXT // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN // XTN2 diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index d1734c455b2b4..08f562c1eaac7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -660,7 +660,7 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. 
def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index d2284f9fa0b50..ade4493545e1f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -803,7 +803,7 @@ def : InstRW<[M4WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index df7402591e7b9..cfc5dfc9f49f1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -841,7 +841,7 @@ def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. 
def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; -def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 697a0f69c58cb..f2cd83caffa2b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$") def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td 
b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td index 4c60992e6351a..bc5ad0f8beced 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -462,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln : let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index e2a293c068774..40738976bdaa2 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1482,7 +1482,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8840b0a180c09..49d5fbbbc1268 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -63,6 +64,21 @@ static cl::opt UseMul24Intrin( cl::ReallyHidden, cl::init(true)); +// Legalize 64-bit division by using 
the generic IR expansion. +static cl::opt ExpandDiv64InIR( + "amdgpu-codegenprepare-expand-div64", + cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + +// Leave all division operations as they are. This supersedes ExpandDiv64InIR +// and is used for testing the legalizer. +static cl::opt DisableIDivExpand( + "amdgpu-codegenprepare-disable-idiv-expansion", + cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor { const GCNSubtarget *ST = nullptr; @@ -160,16 +176,27 @@ class AMDGPUCodeGenPrepare : public FunctionPass, bool divHasSpecialOptimization(BinaryOperator &I, Value *Num, Value *Den) const; + int getDivNumBits(BinaryOperator &I, + Value *Num, Value *Den, + unsigned AtLeast, bool Signed) const; /// Expands 24 bit div or rem. Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, bool IsDiv, bool IsSigned) const; + Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den, unsigned NumBits, + bool IsDiv, bool IsSigned) const; + /// Expands 32 bit div or rem. Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den) const; + Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den) const; + void expandDivRem64(BinaryOperator &I) const; + /// Widen a scalar load. /// /// \details \p Widen scalar load for uniform, small type loads from constant @@ -806,30 +833,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { return getMul64(Builder, LHS, RHS).second; } -// The fractional part of a float is enough to accurately represent up to -// a 24-bit signed integer. 
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den, - bool IsDiv, bool IsSigned) const { - assert(Num->getType()->isIntegerTy(32)); - +/// Figure out how many bits are really needed for this ddivision. \p AtLeast is +/// an optimization hint to bypass the second ComputeNumSignBits call if we the +/// first one is insufficient. Returns -1 on failure. +int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, + Value *Num, Value *Den, + unsigned AtLeast, bool IsSigned) const { const DataLayout &DL = Mod->getDataLayout(); unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); - if (LHSSignBits < 9) - return nullptr; + if (LHSSignBits < AtLeast) + return -1; unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); - if (RHSSignBits < 9) - return nullptr; - + if (RHSSignBits < AtLeast) + return -1; unsigned SignBits = std::min(LHSSignBits, RHSSignBits); - unsigned DivBits = 32 - SignBits; + unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; if (IsSigned) ++DivBits; + return DivBits; +} +// The fractional part of a float is enough to accurately represent up to +// a 24-bit signed integer. 
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const { + int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned); + if (DivBits == -1) + return nullptr; + return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); +} + +Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + unsigned DivBits, + bool IsDiv, bool IsSigned) const { Type *I32Ty = Builder.getInt32Ty(); + Num = Builder.CreateTrunc(Num, I32Ty); + Den = Builder.CreateTrunc(Den, I32Ty); + Type *F32Ty = Builder.getFloatTy(); ConstantInt *One = Builder.getInt32(1); Value *JQ = One; @@ -901,13 +947,18 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, Res = Builder.CreateSub(Num, Rem); } - // Extend in register from the number of bits this divide really is. - if (IsSigned) { - Res = Builder.CreateShl(Res, 32 - DivBits); - Res = Builder.CreateAShr(Res, 32 - DivBits); - } else { - ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); - Res = Builder.CreateAnd(Res, TruncMask); + if (DivBits != 0 && DivBits < 32) { + // Extend in register from the number of bits this divide really is. + if (IsSigned) { + int InRegBits = 32 - DivBits; + + Res = Builder.CreateShl(Res, InRegBits); + Res = Builder.CreateAShr(Res, InRegBits); + } else { + ConstantInt *TruncMask + = Builder.getInt32((UINT64_C(1) << DivBits) - 1); + Res = Builder.CreateAnd(Res, TruncMask); + } } return Res; @@ -981,8 +1032,8 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, } if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { - Res = Builder.CreateTrunc(Res, Ty); - return Res; + return IsSigned ? 
Builder.CreateSExtOrTrunc(Res, Ty) : + Builder.CreateZExtOrTrunc(Res, Ty); } ConstantInt *Zero = Builder.getInt32(0); @@ -1093,6 +1144,53 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, return Res; } +Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den) const { + if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den)) + return nullptr; // Keep it for later optimization. + + Instruction::BinaryOps Opc = I.getOpcode(); + + bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; + bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; + + int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); + if (NumDivBits == -1) + return nullptr; + + Value *Narrowed = nullptr; + if (NumDivBits <= 24) { + Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits, + IsDiv, IsSigned); + } else if (NumDivBits <= 32) { + Narrowed = expandDivRem32(Builder, I, Num, Den); + } + + if (Narrowed) { + return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) : + Builder.CreateZExt(Narrowed, Num->getType()); + } + + return nullptr; +} + +void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const { + Instruction::BinaryOps Opc = I.getOpcode(); + // Do the general expansion. 
+ if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { + expandDivisionUpTo64Bits(&I); + return; + } + + if (Opc == Instruction::URem || Opc == Instruction::SRem) { + expandRemainderUpTo64Bits(&I); + return; + } + + llvm_unreachable("not a division"); +} + bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; @@ -1108,9 +1206,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { Instruction::BinaryOps Opc = I.getOpcode(); Type *Ty = I.getType(); Value *NewDiv = nullptr; + unsigned ScalarSize = Ty->getScalarSizeInBits(); + + SmallVector Div64ToExpand; + if ((Opc == Instruction::URem || Opc == Instruction::UDiv || Opc == Instruction::SRem || Opc == Instruction::SDiv) && - Ty->getScalarSizeInBits() <= 32) { + ScalarSize <= 64 && + !DisableIDivExpand) { Value *Num = I.getOperand(0); Value *Den = I.getOperand(1); IRBuilder<> Builder(&I); @@ -1122,13 +1225,35 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { Value *NumEltN = Builder.CreateExtractElement(Num, N); Value *DenEltN = Builder.CreateExtractElement(Den, N); - Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); - if (!NewElt) - NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + + Value *NewElt; + if (ScalarSize <= 32) { + NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); + if (!NewElt) + NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + } else { + // See if this 64-bit division can be shrunk to 32/24-bits before + // producing the general expansion. + NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN); + if (!NewElt) { + // The general 64-bit expansion introduces control flow and doesn't + // return the new value. Just insert a scalar copy and defer + // expanding it. 
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + Div64ToExpand.push_back(cast(NewElt)); + } + } + NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); } } else { - NewDiv = expandDivRem32(Builder, I, Num, Den); + if (ScalarSize <= 32) + NewDiv = expandDivRem32(Builder, I, Num, Den); + else { + NewDiv = shrinkDivRem64(Builder, I, Num, Den); + if (!NewDiv) + Div64ToExpand.push_back(&I); + } } if (NewDiv) { @@ -1138,6 +1263,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { } } + if (ExpandDiv64InIR) { + // TODO: We get much worse code in specially handled constant cases. + for (BinaryOperator *Div : Div64ToExpand) { + expandDivRem64(*Div); + Changed = true; + } + } + return Changed; } @@ -1255,11 +1388,25 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { bool MadeChange = false; - for (BasicBlock &BB : F) { + Function::iterator NextBB; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { + BasicBlock *BB = &*FI; + NextBB = std::next(FI); + BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) { Next = std::next(I); + MadeChange |= visit(*I); + + if (Next != E) { // Control flow changed + BasicBlock *NextInstBB = Next->getParent(); + if (NextInstBB != BB) { + BB = NextInstBB; + E = BB->end(); + FE = F.end(); + } + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 3dcef2f2415af..f8fee8621a519 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -143,6 +143,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index be98f74de9536..1cf95c5b522f1 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -436,7 +436,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FPTRUNC) .legalFor({{S32, S64}, {S16, S32}}) - .scalarize(0); + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_FPEXT) .legalFor({{S64, S32}, {S32, S16}}) @@ -597,7 +598,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); - // TODO: Expand for > s32 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) .legalFor({S32}) .clampScalar(0, S32, S32) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 59151a3346e61..22528e243a4f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3096,9 +3096,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
case AMDGPU::G_AMDGPU_FFBH_U32: case AMDGPU::G_AMDGPU_FMIN_LEGACY: case AMDGPU::G_AMDGPU_FMAX_LEGACY: + case AMDGPU::G_AMDGPU_RCP_IFLAG: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { @@ -3182,7 +3184,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: case AMDGPU::G_PTRTOINT: - case AMDGPU::G_BSWAP: case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 31c06ce0bfbfb..fb488d2b1aab1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -150,7 +150,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - if (DoesNotSupportXNACK && EnableXNACK) { + // Disable XNACK on targets where it is not enabled by default unless it is + // explicitly requested. + if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { ToggleFeature(AMDGPU::FeatureXNACK); EnableXNACK = false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e9679cdf95978..d250af225345f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -365,6 +365,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. + // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. 
setOperationAction(ISD::BSWAP, MVT::i64, Legal); setOperationAction(ISD::BSWAP, MVT::i32, Legal); @@ -467,7 +468,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i16, Promote); setOperationAction(ISD::UREM, MVT::i16, Promote); - setOperationAction(ISD::BSWAP, MVT::i16, Promote); setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); setOperationAction(ISD::CTTZ, MVT::i16, Promote); @@ -549,6 +549,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + // v_perm_b32 can handle either of these. + setOperationAction(ISD::BSWAP, MVT::i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); + // XXX - Do these do anything? Vector constants turn into build_vector. setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); @@ -3909,7 +3914,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4f16); + assert(VT == MVT::v4f16 || VT == MVT::v4i16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4018,6 +4023,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FABS: case ISD::FNEG: case ISD::FCANONICALIZE: + case ISD::BSWAP: return splitUnaryVectorOp(Op, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9dd51bf4a27d9..beab2eb205fba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1782,8 +1782,8 @@ def : GCNPat < def : GCNPat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), - (V_ALIGNBIT_B32 $a, $a, (i32 24)), - (V_ALIGNBIT_B32 $a, $a, (i32 8))) + (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), + (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, 
(i32 8))) >; // FIXME: This should have been narrowed to i32 during legalization. @@ -1809,8 +1809,9 @@ def : GCNPat < sub1) >; - -let SubtargetPredicate = isGFX8Plus in { +// FIXME: The AddedComplexity should not be needed, but in GlobalISel +// the BFI pattern ends up taking precedence without it. +let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { // Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) // // My reading of the manual suggests we should be using src0 for the @@ -1833,6 +1834,24 @@ def : GCNPat < sub1) >; +// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) +// The 12s emit 0s. +def : GCNPat < + (i16 (bswap i16:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) +>; + +def : GCNPat < + (i32 (zext (bswap i16:$a))), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) +>; + +// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) +def : GCNPat < + (v2i16 (bswap v2i16:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) +>; + } let OtherPredicates = [NoFP16Denormals] in { @@ -2194,6 +2213,12 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} + class BufferLoadGenericInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 7089ba2f77240..bef2fb349741c 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -1294,9 +1294,28 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc) { MCContext &Context = getParser().getContext(); const MCRegisterInfo *RI = getContext().getRegisterInfo(); - 
std::string r = "r"; - std::string v = "v"; - std::string Colon = ":"; + const std::string r = "r"; + const std::string v = "v"; + const std::string Colon = ":"; + using RegPairVals = std::pair; + auto GetRegPair = [this, r](RegPairVals RegPair) { + const std::string R1 = r + utostr(RegPair.first); + const std::string R2 = r + utostr(RegPair.second); + + return std::make_pair(matchRegister(R1), matchRegister(R2)); + }; + auto GetScalarRegs = [RI, GetRegPair](unsigned RegPair) { + const unsigned Lower = RI->getEncodingValue(RegPair); + const RegPairVals RegPair_ = std::make_pair(Lower + 1, Lower); + + return GetRegPair(RegPair_); + }; + auto GetVecRegs = [GetRegPair](unsigned VecRegPair) { + const RegPairVals RegPair = + HexagonMCInstrInfo::GetVecRegPairIndices(VecRegPair); + + return GetRegPair(RegPair); + }; bool is32bit = false; // used to distinguish between CONST32 and CONST64 switch (Inst.getOpcode()) { @@ -1388,14 +1407,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" case Hexagon::A2_tfrp: { MCOperand &MO = Inst.getOperand(1); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = r + utostr(RegPairNum + 1); - StringRef Reg1(R1); - MO.setReg(matchRegister(Reg1)); - // Add a new operand for the second register in the pair. 
- std::string R2 = r + utostr(RegPairNum); - StringRef Reg2(R2); - Inst.addOperand(MCOperand::createReg(matchRegister(Reg2))); + const std::pair RegPair = GetScalarRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode(Hexagon::A2_combinew); break; } @@ -1403,14 +1417,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, case Hexagon::A2_tfrpt: case Hexagon::A2_tfrpf: { MCOperand &MO = Inst.getOperand(2); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = r + utostr(RegPairNum + 1); - StringRef Reg1(R1); - MO.setReg(matchRegister(Reg1)); - // Add a new operand for the second register in the pair. - std::string R2 = r + utostr(RegPairNum); - StringRef Reg2(R2); - Inst.addOperand(MCOperand::createReg(matchRegister(Reg2))); + const std::pair RegPair = GetScalarRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) ? Hexagon::C2_ccombinewt : Hexagon::C2_ccombinewf); @@ -1419,14 +1428,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, case Hexagon::A2_tfrptnew: case Hexagon::A2_tfrpfnew: { MCOperand &MO = Inst.getOperand(2); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = r + utostr(RegPairNum + 1); - StringRef Reg1(R1); - MO.setReg(matchRegister(Reg1)); - // Add a new operand for the second register in the pair. - std::string R2 = r + utostr(RegPairNum); - StringRef Reg2(R2); - Inst.addOperand(MCOperand::createReg(matchRegister(Reg2))); + const std::pair RegPair = GetScalarRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) ? 
Hexagon::C2_ccombinewnewt : Hexagon::C2_ccombinewnewf); @@ -1436,12 +1440,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)" case Hexagon::V6_vassignp: { MCOperand &MO = Inst.getOperand(1); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = v + utostr(RegPairNum + 1); - MO.setReg(MatchRegisterName(R1)); - // Add a new operand for the second register in the pair. - std::string R2 = v + utostr(RegPairNum); - Inst.addOperand(MCOperand::createReg(MatchRegisterName(R2))); + const std::pair RegPair = GetVecRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode(Hexagon::V6_vcombine); break; } diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index d71409de5e356..f3a87ef20a608 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -498,9 +498,13 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) { unsigned Producer = HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg(); - if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) - Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0; - else if (SubregBit) + + if (HexagonMCInstrInfo::IsVecRegPair(Producer)) { + const bool Rev = HexagonMCInstrInfo::IsReverseVecRegPair(Producer); + const unsigned ProdPairIndex = + Rev ? Producer - Hexagon::WR0 : Producer - Hexagon::W0; + Producer = (ProdPairIndex << 1) + SubregBit + Hexagon::V0; + } else if (SubregBit) // Hexagon PRM 10.11 New-value operands // Nt[0] is reserved and should always be encoded as zero. 
return MCDisassembler::Fail; @@ -606,12 +610,16 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { static const MCPhysReg HvxWRDecoderTable[] = { - Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, - Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, - Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, - Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; + Hexagon::W0, Hexagon::WR0, Hexagon::W1, Hexagon::WR1, Hexagon::W2, + Hexagon::WR2, Hexagon::W3, Hexagon::WR3, Hexagon::W4, Hexagon::WR4, + Hexagon::W5, Hexagon::WR5, Hexagon::W6, Hexagon::WR6, Hexagon::W7, + Hexagon::WR7, Hexagon::W8, Hexagon::WR8, Hexagon::W9, Hexagon::WR9, + Hexagon::W10, Hexagon::WR10, Hexagon::W11, Hexagon::WR11, Hexagon::W12, + Hexagon::WR12, Hexagon::W13, Hexagon::WR13, Hexagon::W14, Hexagon::WR14, + Hexagon::W15, Hexagon::WR15, + }; - return (DecodeRegisterClass(Inst, RegNo >> 1, HvxWRDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, HvxWRDecoderTable); } LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily. diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index d55aeaf10852d..2cb3f7c6573e0 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -172,6 +172,13 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::C8); Reserved.set(Hexagon::USR_OVF); + // Leveraging these registers will require more work to recognize + // the new semantics posed, Hi/LoVec patterns, etc. + // Note well: if enabled, they should be restricted to only + // where `HST.useHVXOps() && HST.hasV67Ops()` is true. 
+ for (auto Reg : Hexagon_MC::GetVectRegRev()) + Reserved.set(Reg); + if (MF.getSubtarget().hasReservedR19()) Reserved.set(Hexagon::R19); diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index c23b837bb62fc..ea39dc44d15be 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -18,6 +18,12 @@ let Namespace = "Hexagon" in { let HWEncoding{4-0} = num; } + // These registers are used to preserve a distinction between + // vector register pairs of differing order. + class HexagonFakeReg : Register { + let isArtificial = 1; + } + class HexagonDoubleReg num, string n, list subregs, list alt = []> : RegisterWithSubRegs { @@ -30,6 +36,13 @@ let Namespace = "Hexagon" in { class Ri num, string n, list alt = []> : HexagonReg; + // Rp - false/pseudo registers. These registers are used + // to provide a distinct set of aliases for both styles of vector + // register pairs without encountering subregister indexing constraints. + class R_fake : + HexagonFakeReg; + + // Rf - 32-bit floating-point registers. class Rf num, string n> : HexagonReg; @@ -81,6 +94,7 @@ let Namespace = "Hexagon" in { def isub_hi : SubRegIndex<32, 32>; def vsub_lo : SubRegIndex<512>; def vsub_hi : SubRegIndex<512, 512>; + def vsub_fake: SubRegIndex<512>; def wsub_lo : SubRegIndex<1024>; def wsub_hi : SubRegIndex<1024, 1024>; def subreg_overflow : SubRegIndex<1, 0>; @@ -183,27 +197,49 @@ let Namespace = "Hexagon" in { foreach i = 0-31 in { def V#i : Ri, DwarfRegNum<[!add(i, 99)]>; + def VF#i : R_fake<"__"#!add(i,999999)>, DwarfRegNum<[!add(i, 999999)]>; + def VFR#i : R_fake<"__"#!add(i,9999999)>, DwarfRegNum<[!add(i, 9999999)]>; } def VTMP : Ri<0, "vtmp">, DwarfRegNum<[131]>; // Aliases of the V* registers used to hold double vec values. 
- let SubRegIndices = [vsub_lo, vsub_hi], CoveredBySubRegs = 1 in { - def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>; - def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>; - def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>; - def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>; - def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>; - def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>; - def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>; - def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>; - def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>; - def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>; - def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>; - def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>; - def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>; - def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>; - def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>; - def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>; + let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in { + def W0 : Rd< 0, "v1:0", [V0, V1, VF0]>, DwarfRegNum<[99]>; + def W1 : Rd< 2, "v3:2", [V2, V3, VF1]>, DwarfRegNum<[101]>; + def W2 : Rd< 4, "v5:4", [V4, V5, VF2]>, DwarfRegNum<[103]>; + def W3 : Rd< 6, "v7:6", [V6, V7, VF3]>, DwarfRegNum<[105]>; + def W4 : Rd< 8, "v9:8", [V8, V9, VF4]>, DwarfRegNum<[107]>; + def W5 : Rd<10, "v11:10", [V10, V11, VF5]>, DwarfRegNum<[109]>; + def W6 : Rd<12, "v13:12", [V12, V13, VF6]>, DwarfRegNum<[111]>; + def W7 : Rd<14, "v15:14", [V14, V15, VF7]>, DwarfRegNum<[113]>; + def W8 : Rd<16, "v17:16", [V16, V17, VF8]>, DwarfRegNum<[115]>; + def W9 : Rd<18, "v19:18", [V18, V19, VF9]>, DwarfRegNum<[117]>; + def W10 : Rd<20, "v21:20", [V20, V21, VF10]>, DwarfRegNum<[119]>; + def W11 : Rd<22, "v23:22", [V22, V23, VF11]>, DwarfRegNum<[121]>; + def W12 : Rd<24, "v25:24", [V24, V25, VF12]>, DwarfRegNum<[123]>; + def W13 : Rd<26, "v27:26", [V26, V27, VF13]>, 
DwarfRegNum<[125]>; + def W14 : Rd<28, "v29:28", [V28, V29, VF14]>, DwarfRegNum<[127]>; + def W15 : Rd<30, "v31:30", [V30, V31, VF15]>, DwarfRegNum<[129]>; + } + + // Reverse Aliases of the V* registers used to hold double vec values. + let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in { + def WR0 : Rd< 1, "v0:1", [V0, V1, VFR0]>, DwarfRegNum<[161]>; + def WR1 : Rd< 3, "v2:3", [V2, V3, VFR1]>, DwarfRegNum<[162]>; + def WR2 : Rd< 5, "v4:5", [V4, V5, VFR2]>, DwarfRegNum<[163]>; + def WR3 : Rd< 7, "v6:7", [V6, V7, VFR3]>, DwarfRegNum<[164]>; + def WR4 : Rd< 9, "v8:9", [V8, V9, VFR4]>, DwarfRegNum<[165]>; + def WR5 : Rd<11, "v10:11", [V10, V11, VFR5]>, DwarfRegNum<[166]>; + def WR6 : Rd<13, "v12:13", [V12, V13, VFR6]>, DwarfRegNum<[167]>; + def WR7 : Rd<15, "v14:15", [V14, V15, VFR7]>, DwarfRegNum<[168]>; + def WR8 : Rd<17, "v16:17", [V16, V17, VFR8]>, DwarfRegNum<[169]>; + def WR9 : Rd<19, "v18:19", [V18, V19, VFR9]>, DwarfRegNum<[170]>; + def WR10: Rd<21, "v20:21", [V20, V21, VFR10]>, DwarfRegNum<[171]>; + def WR11: Rd<23, "v22:23", [V22, V23, VFR11]>, DwarfRegNum<[172]>; + def WR12: Rd<25, "v24:25", [V24, V25, VFR12]>, DwarfRegNum<[173]>; + def WR13: Rd<27, "v26:27", [V26, V27, VFR13]>, DwarfRegNum<[174]>; + def WR14: Rd<29, "v28:29", [V28, V29, VFR14]>, DwarfRegNum<[175]>; + def WR15: Rd<31, "v30:31", [V30, V31, VFR15]>, DwarfRegNum<[176]>; } // Aliases of the V* registers used to hold quad vec values. 
@@ -314,7 +350,7 @@ def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512, } def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024, - (add (sequence "W%u", 0, 15))> { + (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>; } @@ -365,6 +401,10 @@ def CtrRegs : RegisterClass<"Hexagon", [i32], 32, FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI, M0, M1, USR)>; +let Size = 64 in +def VectRegRev : RegisterClass<"Hexagon", [i64], 64, + (add (sequence "WR%u", 0, 15))>; + let isAllocatable = 0 in def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp index 65a8dcd75bdca..fbc5e5c344eda 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp @@ -71,9 +71,10 @@ class HexagonVectorPrint : public MachineFunctionPass { char HexagonVectorPrint::ID = 0; static bool isVecReg(unsigned Reg) { - return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) - || (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) - || (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3); + return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) || + (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) || + (Reg >= Hexagon::WR0 && Reg <= Hexagon::WR15) || + (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3); } static std::string getStringReg(unsigned R) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 8b262bd0248e0..52c56d6db5242 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -81,6 +81,9 @@ void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg, if (!MCSubRegIterator(*SRI, &RI).isValid()) // 
Skip super-registers used indirectly. Uses.insert(*SRI); + + if (HexagonMCInstrInfo::IsReverseVecRegPair(R)) + ReversePairs.insert(R); } void HexagonMCChecker::init(MCInst const &MCI) { @@ -133,6 +136,9 @@ void HexagonMCChecker::init(MCInst const &MCI) { if (R == Hexagon::C8) R = Hexagon::USR; + if (HexagonMCInstrInfo::IsReverseVecRegPair(R)) + ReversePairs.insert(R); + // Note register definitions, direct ones as well as indirect side-effects. // Super-registers are not tracked directly, but their components. for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid()); @@ -192,7 +198,7 @@ HexagonMCChecker::HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCRegisterInfo const &ri, bool ReportErrors) : Context(Context), MCB(mcb), RI(ri), MCII(MCII), STI(STI), - ReportErrors(ReportErrors) { + ReportErrors(ReportErrors), ReversePairs() { init(); } @@ -200,7 +206,10 @@ HexagonMCChecker::HexagonMCChecker(HexagonMCChecker const &Other, MCSubtargetInfo const &STI, bool CopyReportErrors) : Context(Other.Context), MCB(Other.MCB), RI(Other.RI), MCII(Other.MCII), - STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false) {} + STI(STI), ReportErrors(CopyReportErrors ? 
Other.ReportErrors : false), + ReversePairs() { + init(); +} bool HexagonMCChecker::check(bool FullCheck) { bool chkP = checkPredicates(); @@ -218,8 +227,9 @@ bool HexagonMCChecker::check(bool FullCheck) { bool chkAXOK = checkAXOK(); bool chkCofMax1 = checkCOFMax1(); bool chkHWLoop = checkHWLoop(); + bool chkLegalVecRegPair = checkLegalVecRegPair(); bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl && - chkAXOK && chkCofMax1 && chkHWLoop; + chkAXOK && chkCofMax1 && chkHWLoop && chkLegalVecRegPair; return chk; } @@ -729,3 +739,16 @@ void HexagonMCChecker::reportWarning(Twine const &Msg) { if (ReportErrors) Context.reportWarning(MCB.getLoc(), Msg); } + +bool HexagonMCChecker::checkLegalVecRegPair() { + const bool IsPermitted = STI.getFeatureBits()[Hexagon::ArchV67]; + const bool HasReversePairs = ReversePairs.size() != 0; + + if (!IsPermitted && HasReversePairs) { + for (auto R : ReversePairs) + reportError("register pair `" + Twine(RI.getName(R)) + + "' is not permitted for this architecture"); + return false; + } + return true; +} diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index bc55ade9ccd78..00afdb664ba51 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -72,6 +72,10 @@ class HexagonMCChecker { using ReadOnlyIterator = std::set::iterator; std::set ReadOnly; + // Contains the vector-pair-registers with the even number + // first ("v0:1", e.g.) used/def'd in this packet. 
+ std::set ReversePairs; + void init(); void init(MCInst const &); void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue); @@ -94,6 +98,7 @@ class HexagonMCChecker { bool checkAXOK(); bool checkHWLoop(); bool checkCOFMax1(); + bool checkLegalVecRegPair(); static void compoundRegisterMap(unsigned &); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 95e23c99868a4..36800b4279437 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -391,15 +391,9 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, static bool RegisterMatches(unsigned Consumer, unsigned Producer, unsigned Producer2) { - if (Consumer == Producer) - return true; - if (Consumer == Producer2) - return true; - // Calculate if we're a single vector consumer referencing a double producer - if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) - if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31) - return ((Consumer - Hexagon::V0) >> 1) == (Producer - Hexagon::W0); - return false; + return (Consumer == Producer) || (Consumer == Producer2) || + HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(Producer, + Consumer); } /// EncodeSingleInstruction - Emit a single @@ -735,7 +729,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, unsigned SOffset = 0; unsigned VOffset = 0; unsigned UseReg = MO.getReg(); - unsigned DefReg1, DefReg2; + unsigned DefReg1 = Hexagon::NoRegister; + unsigned DefReg2 = Hexagon::NoRegister; auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle); const MCOperand *I = Instrs.begin() + State.Index - 1; @@ -746,7 +741,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, if (HexagonMCInstrInfo::isImmext(Inst)) continue; - DefReg1 = DefReg2 = 0; + DefReg1 = Hexagon::NoRegister; + 
DefReg2 = Hexagon::NoRegister; ++SOffset; if (HexagonMCInstrInfo::isVector(MCII, Inst)) { // Vector instructions don't count scalars. diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 4f8a432562196..f9f342a07f6dd 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -676,6 +676,45 @@ bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) { return (Flags & outerLoopMask) != 0; } +bool HexagonMCInstrInfo::IsVecRegPair(unsigned VecReg) { + return (VecReg >= Hexagon::W0 && VecReg <= Hexagon::W15) || + (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15); +} + +bool HexagonMCInstrInfo::IsReverseVecRegPair(unsigned VecReg) { + return (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15); +} + +bool HexagonMCInstrInfo::IsVecRegSingle(unsigned VecReg) { + return (VecReg >= Hexagon::V0 && VecReg <= Hexagon::V31); +} + +std::pair +HexagonMCInstrInfo::GetVecRegPairIndices(unsigned VecRegPair) { + assert(IsVecRegPair(VecRegPair) && + "VecRegPair must be a vector register pair"); + + const bool IsRev = IsReverseVecRegPair(VecRegPair); + const unsigned PairIndex = + 2 * (IsRev ? VecRegPair - Hexagon::WR0 : VecRegPair - Hexagon::W0); + + return IsRev ? std::make_pair(PairIndex, PairIndex + 1) + : std::make_pair(PairIndex + 1, PairIndex); +} + +bool HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(unsigned Producer, + unsigned Consumer) { + if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) { + const unsigned ProdPairIndex = IsReverseVecRegPair(Producer) + ? 
Producer - Hexagon::WR0 + : Producer - Hexagon::W0; + const unsigned ConsumerSingleIndex = (Consumer - Hexagon::V0) >> 1; + + return ConsumerSingleIndex == ProdPairIndex; + } + return false; +} + bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -971,9 +1010,8 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer, unsigned Producer2) { // If we're a single vector consumer of a double producer, set subreg bit // based on if we're accessing the lower or upper register component - if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) - if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31) - return (Consumer - Hexagon::V0) & 0x1; + if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) + return (Consumer - Hexagon::V0) & 0x1; if (Producer2 != Hexagon::NoRegister) return Consumer == Producer; return 0; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 70022aaad7122..7b3c079880f8d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -351,6 +351,16 @@ bool subInstWouldBeExtended(MCInst const &potentialDuplex); unsigned SubregisterBit(unsigned Consumer, unsigned Producer, unsigned Producer2); +bool IsVecRegSingle(unsigned VecReg); +bool IsVecRegPair(unsigned VecReg); +bool IsReverseVecRegPair(unsigned VecReg); +bool IsSingleConsumerRefPairProducer(unsigned Producer, unsigned Consumer); + +/// Returns an ordered pair of the constituent register ordinals for +/// each of the elements of \a VecRegPair. For example, Hexagon::W0 ("v0:1") +/// returns { 0, 1 } and Hexagon::W1 ("v3:2") returns { 3, 2 }. 
+std::pair GetVecRegPairIndices(unsigned VecRegPair); + // Attempt to find and replace compound pairs void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCContext &Context, MCInst &MCI); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index ac5ba87c798da..cd721999a110a 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -532,6 +532,10 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { return F->second; } +llvm::ArrayRef Hexagon_MC::GetVectRegRev() { + return makeArrayRef(VectRegRev); +} + namespace { class HexagonMCInstrAnalysis : public MCInstrAnalysis { public: diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index 6cc6f51ab12c4..a089abc3bd0c4 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include #include @@ -82,6 +83,8 @@ namespace Hexagon_MC { void addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS); unsigned GetELFFlags(const MCSubtargetInfo &STI); + + llvm::ArrayRef GetVectRegRev(); } MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 225cfa0cc4ef7..754cc94062692 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -653,10 +653,10 @@ void MipsRegisterBankInfo::setRegBank(MachineInstr &MI, static void combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner 
&ArtCombiner, - MachineInstr &MI) { + MachineInstr &MI, GISelObserverWrapper &Observer) { SmallVector UpdatedDefs; SmallVector DeadInstrs; - ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs); + ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs, Observer); for (MachineInstr *DeadMI : DeadInstrs) DeadMI->eraseFromParent(); } @@ -689,7 +689,7 @@ void MipsRegisterBankInfo::applyMappingImpl( // not be considered for regbank selection. RegBankSelect for mips // visits/makes corresponding G_MERGE first. Combine them here. if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) - combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI); + combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI, WrapperObserver); // This G_MERGE will be combined away when its corresponding G_UNMERGE // gets regBankSelected. else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) @@ -701,7 +701,7 @@ void MipsRegisterBankInfo::applyMappingImpl( return; } case TargetOpcode::G_UNMERGE_VALUES: - combineAwayG_UNMERGE_VALUES(ArtCombiner, MI); + combineAwayG_UNMERGE_VALUES(ArtCombiner, MI, WrapperObserver); return; default: break; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index a135f100ba04b..027e2c2d45afb 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -2632,12 +2632,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // used to provide rounding control: use MXCSR.RC, encoded as 0b100. // It's consistent with the other FP instructions, which are usually // controlled by MXCSR. - InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4); + unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr + : X86::VCVTPS2PHrr; + InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4); // Move the lower 32-bits of ResultReg to another register of class GR32. + Opc = Subtarget->hasAVX512() ? 
X86::VMOVPDI2DIZrr + : X86::VMOVPDI2DIrr; ResultReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::VMOVPDI2DIrr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(InputReg, RegState::Kill); // The result value is in the lower 16-bits of ResultReg. @@ -2645,19 +2648,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); } else { assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); - // Explicitly sign-extend the input to 32-bit. - InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + // Explicitly zero-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg, /*Kill=*/false); // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, InputReg, /*Kill=*/true); - InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr + : X86::VCVTPH2PSrr; + InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Kill=*/true); // The result value is in the lower 32-bits of ResultReg. // Emit an explicit copy from register class VR128 to register class FR32. 
- ResultReg = createResultReg(&X86::FR32RegClass); + ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(InputReg, RegState::Kill); diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 86d06f0fc7296..15745c10b780b 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -5651,7 +5651,7 @@ struct X86MemUnfoldTable { addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) - // Index 2, folded broadcast + // Index 3, folded broadcast addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); // Sort the memory->reg unfold table. diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index ed38169bb06f8..f92566ba77ce4 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -81,10 +81,8 @@ class LoopRotateLegacyPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - if (EnableMSSALoopDependency) { - AU.addRequired(); + if (EnableMSSALoopDependency) AU.addPreserved(); - } getLoopAnalysisUsage(AU); } @@ -101,8 +99,11 @@ class LoopRotateLegacyPass : public LoopPass { const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional MSSAU; if (EnableMSSALoopDependency) { - MemorySSA *MSSA = &getAnalysis().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); + // Not requiring MemorySSA and getting it only if available will split + // the loop pass pipeline when LoopRotate is being run first. + auto *MSSAA = getAnalysisIfAvailable(); + if (MSSAA) + MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); } return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr, SQ, diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll deleted file mode 100644 index 152cb175ef608..0000000000000 --- a/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll +++ /dev/null @@ -1,173 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -gvn -S | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.memset.p0i8.i8(i8*, i8, i32, i1) -declare void @foo(i8*) - -define i8 @test(i1 %cmp) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = alloca i8 -; CHECK-NEXT: store i8 5, i8* [[P]] -; CHECK-NEXT: br label [[HEADER:%.*]] -; CHECK: header: -; CHECK-NEXT: [[V:%.*]] = phi i8 [ 5, [[ENTRY:%.*]] ], [ -5, [[ALIVE:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[I_INC:%.*]], [[ALIVE]] ] -; CHECK-NEXT: br i1 [[CMP:%.*]], label [[ALIVE]], label [[DEAD:%.*]] -; CHECK: dead: -; CHECK-NEXT: call void @foo(i8* [[P]]) -; CHECK-NEXT: [[I_1:%.*]] = add i8 [[I]], [[V]] -; CHECK-NEXT: br label [[ALIVE]] -; CHECK: alive: -; CHECK-NEXT: [[I_2:%.*]] = phi i8 [ [[I]], [[HEADER]] ], [ [[I_1]], [[DEAD]] ] -; CHECK-NEXT: store i8 -5, i8* [[P]] -; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[P]], i8 0, i32 1, i1 false) -; CHECK-NEXT: [[I_INC]] = add i8 [[I_2]], 1 -; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp ugt i8 [[I_INC]], 100 -; CHECK-NEXT: br i1 [[CMP_LOOP]], label [[EXIT:%.*]], label [[HEADER]] -; CHECK: exit: -; CHECK-NEXT: ret i8 0 -; - -entry: - %p = alloca i8 - %addr = getelementptr inbounds i8, i8* %p, i64 0 - store i8 5, i8* %addr - br label %header -header: - %i = phi i8 [0, %entry], [%i.inc, %backedge] - br i1 %cmp, label %alive, label %dead -dead: - call void @foo(i8* %p) - %v = load i8, i8* %addr, !invariant.load !1 - %i.1 = add i8 %i, %v - br label %alive 
-alive: - %i.2 = phi i8 [%i, %header], [%i.1, %dead] - store i8 -5, i8* %addr - br label %backedge -backedge: - call void @llvm.memset.p0i8.i8(i8 * align 1 %p, i8 0, i32 1, i1 false) - %i.inc = add i8 %i.2, 1 - %cmp.loop = icmp ugt i8 %i.inc, 100 - br i1 %cmp.loop, label %exit, label %header -exit: - %res = load i8, i8* %addr - ret i8 %res -} - -; Check that first two loads are not optimized out while the one marked with -; invariant.load reuses %res1 -define i8 @test2(i1 %cmp, i8 *%p) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[RES1:%.*]] = load i8, i8* [[P:%.*]] -; CHECK-NEXT: call void @foo(i8* [[P]]) -; CHECK-NEXT: br i1 [[CMP:%.*]], label [[B2:%.*]], label [[B1:%.*]] -; CHECK: b1: -; CHECK-NEXT: [[RES2:%.*]] = load i8, i8* [[P]] -; CHECK-NEXT: [[RES3:%.*]] = add i8 [[RES1]], [[RES2]] -; CHECK-NEXT: br label [[ALIVE:%.*]] -; CHECK: b2: -; CHECK-NEXT: [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]] -; CHECK-NEXT: br label [[ALIVE]] -; CHECK: alive: -; CHECK-NEXT: [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ] -; CHECK-NEXT: ret i8 [[RES_PHI]] -; - -entry: - %res1 = load i8, i8* %p - call void @foo(i8 *%p) - br i1 %cmp, label %b2, label %b1 -b1: - %res2 = load i8, i8* %p - %res3 = add i8 %res1, %res2 - br label %alive -b2: - %v = load i8, i8* %p, !invariant.load !1 - %res.dead = add i8 %v, %res1 - br label %alive -alive: - %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2] - ret i8 %res.phi -} - -; This is essentially the same test case as the above one but with %b1 and %b2 -; swapped in "br i1 %cmp, label %b1, label %b2" instruction. That helps us to -; ensure that results doesn't depend on visiting order. 
-define i8 @test3(i1 %cmp, i8 *%p) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[RES1:%.*]] = load i8, i8* [[P:%.*]] -; CHECK-NEXT: call void @foo(i8* [[P]]) -; CHECK-NEXT: br i1 [[CMP:%.*]], label [[B1:%.*]], label [[B2:%.*]] -; CHECK: b1: -; CHECK-NEXT: [[RES2:%.*]] = load i8, i8* [[P]] -; CHECK-NEXT: [[RES3:%.*]] = add i8 [[RES1]], [[RES2]] -; CHECK-NEXT: br label [[ALIVE:%.*]] -; CHECK: b2: -; CHECK-NEXT: [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]] -; CHECK-NEXT: br label [[ALIVE]] -; CHECK: alive: -; CHECK-NEXT: [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ] -; CHECK-NEXT: ret i8 [[RES_PHI]] -; -entry: - %res1 = load i8, i8* %p - call void @foo(i8 *%p) - br i1 %cmp, label %b1, label %b2 -b1: - %res2 = load i8, i8* %p - %res3 = add i8 %res1, %res2 - br label %alive -b2: - %v = load i8, i8* %p, !invariant.load !1 - %res.dead = add i8 %v, %res1 - br label %alive -alive: - %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2] - ret i8 %res.phi -} - - -; This is reduced test case catching regression in the first version of the -; fix for invariant loads (https://reviews.llvm.org/D64405). 
-define void @test4() { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* inttoptr (i64 8 to float*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[TMP0]], [[TMP0]] -; CHECK-NEXT: br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER:%.*]] -; CHECK: fusion.loop_header.dim.1.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ] -; CHECK-NEXT: [[FUSION_INVAR_ADDRESS_DIM_0_03:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INVAR_INC3:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], i64 0, i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = fmul float [[TMP2]], [[TMP2]] -; CHECK-NEXT: [[INVAR_INC3]] = add nuw nsw i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], 1 -; CHECK-NEXT: [[DOTPHI_TRANS_INSERT:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[INVAR_INC3]], i64 0, i64 2 -; CHECK-NEXT: [[DOTPRE]] = load float, float* [[DOTPHI_TRANS_INSERT]], align 4, !invariant.load !0 -; CHECK-NEXT: br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] -; -entry: - %0 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 0, i64 0, i64 2 - %1 = load float, float* %0, align 4 - %2 = fmul float %1, %1 - br label %fusion.loop_header.dim.1.preheader - -fusion.loop_header.dim.1.preheader: ; preds = %fusion.loop_header.dim.1.preheader, %entry - %fusion.invar_address.dim.0.03 = phi i64 [ 0, %entry ], [ %invar.inc3, %fusion.loop_header.dim.1.preheader ] - %3 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 %fusion.invar_address.dim.0.03, i64 0, i64 2 - %4 = load float, float* %3, align 4, !invariant.load !1 - %5 = fmul float %4, %4 - %6 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 
%fusion.invar_address.dim.0.03, i64 0, i64 2 - %7 = load float, float* %6, align 4, !invariant.load !1 - %8 = fmul float %7, %7 - %invar.inc3 = add nuw nsw i64 %fusion.invar_address.dim.0.03, 1 - br label %fusion.loop_header.dim.1.preheader -} - -!1 = !{} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/artifact-combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/artifact-combine-unmerge.mir new file mode 100644 index 0000000000000..3e42fd5b99e31 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/artifact-combine-unmerge.mir @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -march=aarch64 -run-pass=legalizer %s | FileCheck %s + +# Make sure we don't lose the register bank constraints when +# artifact combining G_UNMERGE_VALUES instructions. +--- +name: test_none_none +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_none_none + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: $w0 = COPY [[COPY]](s32) + ; CHECK: $w1 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... +--- +name: test_gpr_none +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_gpr_none + ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1 + ; CHECK: $w0 = COPY [[COPY]](s32) + ; CHECK: $w1 = COPY [[COPY1]](s32) + %0:gpr(s32) = COPY $w0 + %1:gpr(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... 
+--- +name: test_none_gpr +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_none_gpr + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:gpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[COPY1]](s32) + ; CHECK: $w0 = COPY [[COPY2]](s32) + ; CHECK: $w1 = COPY [[COPY3]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:gpr(s32), %4:gpr(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... +--- +name: test_fpr_gpr +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fpr_gpr + ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:gpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[COPY1]](s32) + ; CHECK: $w0 = COPY [[COPY2]](s32) + ; CHECK: $w1 = COPY [[COPY3]](s32) + %0:fpr(s32) = COPY $w0 + %1:fpr(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:gpr(s32), %4:gpr(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll new file mode 100644 index 0000000000000..b5a05974f72be --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIF Bitwise Insert if False +; +; 8-bit vectors tests + +define <1 x i8> @test_bitf_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bitf_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i8> %C, + %and = and <1 x i8> %neg, %B + %and1 = and <1 x i8> %C, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bitf_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bitf_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i16> %C, + %and = and <1 x i16> %neg, %B + %and1 = and <1 x i16> %C, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bitf_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i32> %C, + %and = and <1 x i32> %neg, %B + %and1 = and <1 x i32> %C, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bitf_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bitf_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i64> %C, + %and = and <1 x i64> %neg, %B + %and1 = and <1 x i64> %C, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bitf_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bitf_v2i32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <2 x i32> %C, + %and = and <2 x i32> %neg, %B + %and1 = and <2 x i32> %C, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bitf_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bitf_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <4 x i16> %C, + %and = and <4 x i16> %neg, %B + %and1 = and <4 x i16> %C, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bitf_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bitf_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <8 x i8> %C, + %and = and <8 x i8> %neg, %B + %and1 = and <8 x i8> %C, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bitf_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bitf_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <2 x i64> %C, + %and = and <2 x i64> %neg, %B + %and1 = and <2 x i64> %C, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bitf_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bitf_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <4 x i32> %C, + %and = and <4 x i32> %neg, %B + %and1 = and <4 x i32> %C, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bitf_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bitf_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <8 x i16> %C, + %and = and <8 x i16> %neg, %B + %and1 = and <8 x i16> %C, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bitf_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x 
i8> %C) { +; CHECK-LABEL: test_bitf_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <16 x i8> %C, + %and = and <16 x i8> %neg, %B + %and1 = and <16 x i8> %C, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll new file mode 100644 index 0000000000000..f29ea22ff8dbb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIT Bitwise Insert if True +; +; 8-bit vectors tests + +define <1 x i8> @test_bit_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bit_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i8> %C, %B + %neg = xor <1 x i8> %C, + %and1 = and <1 x i8> %neg, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bit_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bit_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i16> %C, %B + %neg = xor <1 x i16> %C, + %and1 = and <1 x i16> %neg, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bit_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i32> %C, %B + %neg = xor <1 x i32> %C, + %and1 = and <1 x i32> %neg, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bit_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bit_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 
x i64> %C, %B + %neg = xor <1 x i64> %C, + %and1 = and <1 x i64> %neg, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bit_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bit_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <2 x i32> %C, %B + %neg = xor <2 x i32> %C, + %and1 = and <2 x i32> %neg, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bit_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bit_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <4 x i16> %C, %B + %neg = xor <4 x i16> %C, + %and1 = and <4 x i16> %neg, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bit_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bit_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <8 x i8> %C, %B + %neg = xor <8 x i8> %C, + %and1 = and <8 x i8> %neg, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bit_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bit_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <2 x i64> %C, %B + %neg = xor <2 x i64> %C, + %and1 = and <2 x i64> %neg, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bit_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bit_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <4 x i32> %C, %B + %neg = xor <4 x i32> %C, + %and1 = and <4 x i32> %neg, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bit_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bit_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, 
v2.16b +; CHECK-NEXT: ret + %and = and <8 x i16> %C, %B + %neg = xor <8 x i16> %C, + %and1 = and <8 x i16> %neg, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bit_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <16 x i8> %C, %B + %neg = xor <16 x i8> %C, + %and1 = and <16 x i8> %neg, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll index 464726b0d2f30..cad3fb58086d6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -9,8 +9,7 @@ define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) { ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8b, v3.8b, v2.8b ; CHECK-NEXT: dup v2.8b, v2.b[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d @@ -49,8 +48,7 @@ define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.16b, v3.16b, v2.16b ; CHECK-NEXT: dup v2.16b, v2.b[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d @@ -92,8 +90,7 @@ define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4h, v3.4h, v2.4h ; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d @@ 
-107,8 +104,7 @@ define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8h, v3.8h, v2.8h ; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d @@ -122,8 +118,7 @@ define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.2s, v3.2s, v2.2s ; CHECK-NEXT: dup v2.2s, v2.s[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d @@ -137,8 +132,7 @@ define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d @@ -151,8 +145,7 @@ define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d @@ -166,8 +159,7 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq v2.2d, v3.2d, v2.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, 
<2x i64> %c, <2x i64> %d @@ -222,8 +214,7 @@ define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x float> %c, <4x float> %d @@ -247,8 +238,7 @@ define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d @@ -278,8 +268,7 @@ define <2 x i32> @test_select_cc_v2i32_icmpi1(i1 %cc, <2 x i32> %a, <2 x i32> %b ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp = icmp ne i1 %cc, 0 %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b @@ -294,8 +283,7 @@ define <3 x float> @test_select_cc_v3f32_fcmp_f32(<3 x float> %a, <3 x float> %b ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: fcmeq v2.4s, v2.4s, v3.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq float %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b @@ -309,8 +297,7 @@ define <3 x float> @test_select_cc_v3f32_fcmp_f64(<3 x float> %a, <3 x float> %b ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-NEXT: fcmeq v2.2d, v2.2d, v3.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, 
v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq double %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll index 59afe47042ffb..bf049c20e6c2d 100644 --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -5,8 +5,7 @@ define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 { ; CHECK-LABEL: select_64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: %0 = bitcast <4 x half> %a to <4 x i16> @@ -23,8 +22,7 @@ entry: define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 { ; CHECK-LABEL: select_128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: %0 = bitcast <8 x half> %a to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll index 4fe52e7cae249..521f7f6521bf0 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -61,8 +61,7 @@ define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: bsl8xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 > %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 > @@ -74,8 +73,7 @@ define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: bsl16xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, 
v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 > %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -664,8 +662,7 @@ define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: bsl2xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x000000ffffffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > @@ -678,8 +675,7 @@ define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: bsl4xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 > %tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 > @@ -691,8 +687,7 @@ define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: bsl1xi64_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0xffffffffffffff00 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <1 x i64> %a, < i64 -256 > %tmp2 = and <1 x i64> %b, < i64 255 > @@ -704,8 +699,7 @@ define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: bsl4xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 > %tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 > @@ 
-717,8 +711,7 @@ define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: bsl8xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 > %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 > @@ -731,8 +724,7 @@ define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI75_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI75_0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 > %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index 8e54d91662775..08adbd1507220 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -480,9 +480,9 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v0.2d, x9 -; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v1.2d, x9 +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %c = icmp ult <2 x i64> %x, %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> @@ -653,8 +653,8 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v2.16b, v1.16b ; CHECK-NEXT: cmhi v3.2d, v2.2d, v0.2d -; CHECK-NEXT: bsl v3.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.2d, v3.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %noty = xor <2 x i64> 
%y, %c = icmp ult <2 x i64> %x, %noty diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll index a559b7868575f..cb1fac16aa9c1 100644 --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -71,10 +71,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s +; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) ret <2 x float> %1 @@ -95,10 +94,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 { ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s -; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) ret <4 x float> %1 @@ -120,21 +118,19 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 { ; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.4s, v1.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; 
CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) ret <8 x float> %1 @@ -210,10 +206,9 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 { ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d +; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) ret <2 x double> %1 @@ -238,24 +233,22 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 { ; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d -; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul 
v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) ret <4 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll index 0e2c891816c1d..2e385fdd6f25f 100644 --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll @@ -62,8 +62,7 @@ define <4 x i32> @out_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %ma ; CHECK-LABEL: out_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, %x @@ -76,8 +75,7 @@ define <4 x i32> @in_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mas ; CHECK-LABEL: in_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, ; %x %n1 = and <4 x i32> %n0, %mask @@ -90,8 +88,7 @@ define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x ; CHECK-LABEL: out_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, 
#42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, %x @@ -105,8 +102,7 @@ define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: in_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> %x, ; %x @@ -169,9 +165,8 @@ define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, @@ -183,9 +178,8 @@ define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %ma define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> , %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -197,9 +191,8 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mas define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; 
CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, @@ -212,9 +205,8 @@ define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> , %y ; %x diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll index df86540fdd964..607f5dd3dc772 100644 --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll @@ -13,8 +13,7 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -46,8 +45,7 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -111,8 +109,7 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b 
-; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -128,8 +125,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: out_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -141,8 +137,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -154,8 +149,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -167,8 +161,7 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -180,8 +173,7 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, 
<1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -197,8 +189,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: out_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <16 x i8> %x, %mask %notmask = xor <16 x i8> %mask, @@ -210,8 +201,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: out_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -223,8 +213,7 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -236,8 +225,7 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -249,8 
+237,7 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, @@ -270,8 +257,7 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwin define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -286,8 +272,7 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-LABEL: in_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i8> %x, %y %n1 = and <2 x i8> %n0, %mask @@ -298,8 +283,7 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: in_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -314,8 +298,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 
= xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -326,8 +309,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -338,8 +320,7 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: in_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -354,8 +335,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -366,8 +346,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -378,8 +357,7 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: in_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, 
v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -390,8 +368,7 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: in_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -406,8 +383,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -418,8 +394,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -430,8 +405,7 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: in_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -442,8 +416,7 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; 
CHECK-LABEL: in_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index de3c1fafb6de6..3f2ec1e17894f 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -318,8 +318,8 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -351,8 +351,8 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -577,8 +577,8 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -606,8 +606,8 @@ define <4 x i32> 
@test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -637,8 +637,8 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -670,8 +670,8 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -699,8 +699,8 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -729,8 +729,8 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-NEXT: 
ldr q4, [x8, :lo12:.LCPI25_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -761,8 +761,8 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index 5ee7c2a9aee96..85abb4d7f8303 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -104,8 +104,8 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4] ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll new file mode 100644 index 0000000000000..a374369478d12 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -0,0 +1,594 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck 
-check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s + +define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) { +; GFX7-LABEL: s_bswap_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call i32 @llvm.bswap.i32(i32 %src) + %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap) + ret i32 %to.sgpr +} + +define i32 @v_bswap_i32(i32 %src) { +; GFX7-LABEL: v_bswap_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i32 
@llvm.bswap.i32(i32 %src) + ret i32 %bswap +} + +define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) { +; GFX7-LABEL: s_bswap_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v2, s1, s1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s0, v2, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) + %bswap.0 = extractelement <2 x i32> %bswap, i32 0 + %bswap.1 = extractelement <2 x i32> %bswap, i32 1 + %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) + %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) + %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 + %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 + ret <2 x i32> %ins.1 +} + +define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) { +; GFX7-LABEL: v_bswap_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; 
GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) + ret <2 x i32> %bswap +} + +define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) { +; GFX7-LABEL: s_bswap_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24 +; GFX7-NEXT: s_mov_b32 s1, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_mov_b32 s1, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_mov_b32 s1, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: 
v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call i64 @llvm.bswap.i64(i64 %src) + %cast = bitcast i64 %bswap to <2 x i32> + %elt0 = extractelement <2 x i32> %cast, i32 0 + %elt1 = extractelement <2 x i32> %cast, i32 1 + %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) + %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) + %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 + %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 + ret <2 x i32> %ins.1 +} + +define i64 @v_bswap_i64(i64 %src) { +; GFX7-LABEL: v_bswap_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v2, s4, v1, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v2, 0, v1, s4 +; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v2, 0, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i64 @llvm.bswap.i64(i64 %src) + ret i64 %bswap +} + +define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) { +; GFX7-LABEL: s_bswap_v2i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24 +; GFX7-NEXT: s_mov_b32 s1, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8 +; 
GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1 +; GFX7-NEXT: v_alignbit_b32 v2, s3, s3, 8 +; GFX7-NEXT: v_alignbit_b32 v3, s3, s3, 24 +; GFX7-NEXT: v_bfi_b32 v2, s1, v3, v2 +; GFX7-NEXT: v_alignbit_b32 v3, s2, s2, 8 +; GFX7-NEXT: v_alignbit_b32 v4, s2, s2, 24 +; GFX7-NEXT: v_bfi_b32 v3, s1, v4, v3 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_v2i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_mov_b32 s1, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX8-NEXT: v_perm_b32 v2, 0, v2, s1 +; GFX8-NEXT: v_perm_b32 v3, 0, v3, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_mov_b32 s1, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX9-NEXT: v_perm_b32 v2, 0, v2, s1 +; GFX9-NEXT: v_perm_b32 v3, 0, v3, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) + %cast = bitcast <2 x i64> %bswap to <4 x i32> + %bswap.0 = extractelement <4 x i32> %cast, i32 0 + %bswap.1 = extractelement <4 x i32> %cast, i32 1 + %bswap.2 = extractelement <4 x i32> %cast, i32 2 + %bswap.3 = 
extractelement <4 x i32> %cast, i32 3 + %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) + %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) + %to.sgpr2 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.2) + %to.sgpr3 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.3) + %ins.0 = insertelement <4 x i32> undef, i32 %to.sgpr0, i32 0 + %ins.1 = insertelement <4 x i32> %ins.0, i32 %to.sgpr1, i32 1 + %ins.2 = insertelement <4 x i32> %ins.1, i32 %to.sgpr2, i32 2 + %ins.3 = insertelement <4 x i32> %ins.2, i32 %to.sgpr3, i32 3 + ret <4 x i32> %ins.3 +} + +define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) { +; GFX7-LABEL: v_bswap_v2i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v4, s4, v1, v4 +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v3, 8 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; GFX7-NEXT: v_bfi_b32 v5, s4, v3, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v2, 8 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; GFX7-NEXT: v_bfi_b32 v3, s4, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_v2i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v4, 0, v1, s4 +; GFX8-NEXT: v_perm_b32 v5, 0, v3, s4 +; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX8-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v4, 0, v1, s4 +; GFX9-NEXT: v_perm_b32 v5, 0, v3, s4 +; 
GFX9-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX9-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) + ret <2 x i64> %bswap +} + +define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) { +; GFX7-LABEL: s_bswap_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = zext i16 %bswap to i32 + %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + %trunc = trunc i32 %to.sgpr to i16 + ret i16 %trunc +} + +define i16 @v_bswap_i16(i16 %src) { +; GFX7-LABEL: v_bswap_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + ret i16 %bswap +} + +define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { +; GFX7-LABEL: s_bswap_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v2, s1, s1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s0, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: 
v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) + %cast0 = bitcast <2 x i16> %bswap to i32 + %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) + ret i32 %to.sgpr +} + +define i32 @v_bswap_i16_zext_to_i32(i16 %src) { +; GFX7-LABEL: v_bswap_i16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = zext i16 %bswap to i32 + ret i32 %zext +} + +define i32 @v_bswap_i16_sext_to_i32(i16 %src) { +; GFX7-LABEL: v_bswap_i16_sext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i16_sext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i16_sext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = sext i16 %bswap to i32 + ret i32 %zext +} + +define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { +; GFX7-LABEL: v_bswap_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: 
s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) + ret <2 x i16> %bswap +} + +; FIXME +; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { +; %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %ext.src) +; ret <3 x i16> %bswap +; } + +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i16 @llvm.bswap.i16(i16) #1 +declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1 +declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1 +declare i32 @llvm.bswap.i32(i32) #1 +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) #1 +declare i64 @llvm.bswap.i64(i64) #1 +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #1 + +attributes #0 = { convergent nounwind readnone } +attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir new file mode 100644 index 0000000000000..2200618ee04e2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -0,0 +1,28 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s + +--- +name: bswap_i32_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0 + ; GFX7-LABEL: name: bswap_i32_vv + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[V_ALIGNBIT_B32_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32 [[COPY]], [[COPY]], 8, implicit $exec + ; GFX7: [[V_ALIGNBIT_B32_1:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32 [[COPY]], [[COPY]], 24, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 + ; GFX7: [[V_BFI_B32_:%[0-9]+]]:vgpr_32 = V_BFI_B32 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_1]], [[V_ALIGNBIT_B32_]], implicit $exec + ; GFX7: S_ENDPGM 0, implicit [[V_BFI_B32_]] + ; GFX8-LABEL: name: bswap_i32_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX8: [[V_PERM_B32_:%[0-9]+]]:vgpr_32 = V_PERM_B32 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_PERM_B32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_BSWAP %0 + S_ENDPGM 0, implicit %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir index 36539926c3658..a7c4773c20d1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir @@ -129,3 +129,20 @@ body: | $vgpr0_vgpr1 = COPY %1 ... 
+--- +name: bswap_s64 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: bswap_s64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[UV1]] + ; CHECK: [[BSWAP1:%[0-9]+]]:_(s32) = G_BSWAP [[UV]] + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BSWAP]](s32), [[BSWAP1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_BSWAP %0 + $vgpr0_vgpr1 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir index eb660979a9ce5..fe05c7b2ff469 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir @@ -79,3 +79,417 @@ body: | %2:_(<2 x s32>) = G_ANYEXT %1 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_fptrunc_s64_to_s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: test_fptrunc_s64_to_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) 
= G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: 
[[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s16) = G_FPTRUNC %0 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... 
+ +--- +name: test_fptrunc_v2s64_to_v2s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_fptrunc_v2s64_to_v2s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB 
[[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; 
CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) + ; CHECK: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[LSHR5]], [[C1]] + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C2]] + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]] + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]] + ; CHECK: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]] + ; CHECK: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1) + ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]] + ; CHECK: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]] + ; CHECK: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]] + ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C9]](s32) + ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]] + ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND5]] + ; CHECK: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]] + ; CHECK: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]] + ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]] + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32) + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32) + ; CHECK: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]] + ; CHECK: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1) + ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]] + ; CHECK: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP 
intpred(slt), [[AND5]](s32), [[C10]] + ; CHECK: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]] + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]] + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32) + ; CHECK: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]] + ; CHECK: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1) + ; CHECK: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]] + ; CHECK: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1) + ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]] + ; CHECK: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]] + ; CHECK: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND5]](s32), [[C17]] + ; CHECK: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]] + ; CHECK: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND5]](s32), [[C18]] + ; CHECK: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]] + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]] + ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]] + ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C21]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR15]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C21]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32) + ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]] + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s16>) = G_FPTRUNC %0 + $vgpr0 = COPY %1 +... 
+ +--- +name: test_fptrunc_s64_to_s16_afn +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: test_fptrunc_s64_to_s16_afn + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 
+ ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C19]](s32) + ; CHECK: 
[[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s16) = G_FPTRUNC %0 + %2:_(s32) = afn G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: test_fptrunc_v2s64_to_v2s16_afn +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_fptrunc_v2s64_to_v2s16_afn + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + 
; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; 
CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) + ; CHECK: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[LSHR5]], [[C1]] + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C2]] + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]] + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]] + ; CHECK: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]] + ; CHECK: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1) + ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]] + ; CHECK: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]] + ; CHECK: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]] + ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C9]](s32) + ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]] + ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND5]] + ; CHECK: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]] + ; CHECK: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]] + ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]] + ; CHECK: 
[[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32) + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32) + ; CHECK: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]] + ; CHECK: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1) + ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]] + ; CHECK: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND5]](s32), [[C10]] + ; CHECK: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]] + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]] + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32) + ; CHECK: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]] + ; CHECK: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1) + ; CHECK: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]] + ; CHECK: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1) + ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]] + ; CHECK: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]] + ; CHECK: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND5]](s32), [[C17]] + ; CHECK: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]] + ; CHECK: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND5]](s32), [[C18]] + ; CHECK: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]] + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]] + ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]] + ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C21]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR15]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C21]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32) + ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]] + ; 
CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s16>) = afn G_FPTRUNC %0 + $vgpr0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir index f78f17ebfe54f..30065daa3bde2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_sdiv_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir index 5402048fae983..aaf6431f2d933 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck 
-check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_srem_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir index f6b43a81e4e1a..3a2e294741182 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_udiv_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir index 29e2e12bdd6be..efaa4f39b1908 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_urem_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir index 9850b87959af3..818c9368ea9e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir @@ -11,7 +11,8 @@ body: | liveins: $sgpr0 ; CHECK-LABEL: name: bswap_i32_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[BSWAP:%[0-9]+]]:sgpr(s32) = G_BSWAP [[COPY]] + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[BSWAP:%[0-9]+]]:vgpr(s32) = G_BSWAP [[COPY1]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = G_BSWAP %0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index daf56e41522a2..0f4c09433c1cc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -5505,8 +5505,13 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @udiv_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_pow2k_denom: @@ -5516,8 +5521,8 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -5531,8 +5536,13 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* 
[[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: @@ -5540,7 +5550,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s4, 0xf001 +; GCN-NEXT: s_movk_i32 s6, 0xf001 ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5549,11 +5559,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_movk_i32 s0, 0xfff +; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 +; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 @@ -5571,19 +5583,17 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_hi_u32 
v5, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v6, v3, s4 -; GCN-NEXT: v_mul_lo_u32 v8, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 +; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 ; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 -; GCN-NEXT: s_movk_i32 s0, 0xfff ; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 @@ -5608,6 +5618,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc @@ -5641,9 +5652,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], 12 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = udiv <2 x i64> %x, @@ -5654,8 +5664,15 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x 
i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_pow2_shl_denom: @@ -5666,10 +5683,10 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s1, s2, 12 ; GCN-NEXT: s_add_i32 s0, s0, 12 -; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s1 +; GCN-NEXT: s_add_i32 s2, s2, 12 ; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -5874,8 +5891,13 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @urem_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = urem <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = 
extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i64_pow2k_denom: @@ -5887,8 +5909,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s1, s2, s8 ; GCN-NEXT: s_and_b32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s1, s2, s8 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -5902,8 +5924,15 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = urem <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i64_pow2_shl_denom: @@ -5916,14 +5945,14 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 
s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 ; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 -; GCN-NEXT: s_add_u32 s2, s2, -1 -; GCN-NEXT: s_addc_u32 s3, s3, -1 -; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] +; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 ; GCN-NEXT: s_add_u32 s0, s0, -1 ; GCN-NEXT: s_addc_u32 s1, s1, -1 ; GCN-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, -1 +; GCN-NEXT: s_addc_u32 s3, s3, -1 +; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -6249,8 +6278,13 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i64_pow2k_denom: @@ -6260,16 +6294,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-NEXT: s_lshr_b32 s8, s8, 20 -; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_addc_u32 s3, s3, 0 ; GCN-NEXT: s_ashr_i32 s8, s1, 31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GCN-NEXT: 
s_lshr_b32 s8, s8, 20 ; GCN-NEXT: s_add_u32 s0, s0, s8 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: s_lshr_b32 s8, s8, 20 +; GCN-NEXT: s_add_u32 s2, s2, s8 +; GCN-NEXT: s_addc_u32 s3, s3, 0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -6283,101 +6317,112 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 +; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s6, 0xf001 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mac_f32_e32 v0, 
0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s6 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s0, s9, 31 +; GCN-NEXT: s_lshr_b32 s0, s0, 20 +; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 +; GCN-NEXT: s_add_u32 s2, s8, s0 +; GCN-NEXT: s_addc_u32 s3, s9, 0 +; GCN-NEXT: s_ashr_i32 s8, s11, 31 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v4, v2, s6 -; GCN-NEXT: v_mul_hi_u32 v6, s6, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, 
s6 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 +; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 +; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 +; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[2:3] -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: s_add_u32 s0, s10, s2 +; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] +; 
GCN-NEXT: s_add_u32 s0, s10, s8 +; GCN-NEXT: s_addc_u32 s1, s11, s8 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s1, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_movk_i32 s3, 0xfff +; GCN-NEXT: s_movk_i32 s9, 0xfff ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 -; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 ; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc ; GCN-NEXT: s_movk_i32 s0, 0xffe ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 @@ -6394,22 +6439,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: s_ashr_i32 s0, s9, 31 -; GCN-NEXT: s_lshr_b32 s0, s0, 20 -; GCN-NEXT: s_add_u32 s0, s8, s0 -; GCN-NEXT: s_addc_u32 s1, s9, 0 -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv <2 x i64> %x, @@ -6420,8 +6460,15 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 
1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: @@ -6432,8 +6479,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 s18, 0x4f800000 ; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: s_ashr_i32 s16, s3, 31 ; GCN-NEXT: s_add_u32 s2, s2, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -6503,22 +6550,22 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: s_add_u32 s0, s10, s2 +; GCN-NEXT: s_ashr_i32 s2, s9, 31 +; GCN-NEXT: s_add_u32 s0, s8, s2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] +; GCN-NEXT: s_addc_u32 s1, s9, s2 +; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 +; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, 
v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 @@ -6533,8 +6580,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, s11, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GCN-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc ; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] @@ -6548,14 +6595,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 ; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: s_ashr_i32 s10, s13, 31 +; GCN-NEXT: s_ashr_i32 s8, s13, 31 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: s_add_u32 s12, s12, s10 +; GCN-NEXT: s_add_u32 s12, s12, s8 ; GCN-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v8, s11 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_addc_u32 s13, s13, s10 -; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v8, s9 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s13, s13, s8 +; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v10, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, s13 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc @@ -6624,42 +6671,42 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 +; 
GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_add_u32 s0, s10, s14 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s9, s14 -; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[14:15] +; GCN-NEXT: s_addc_u32 s1, s11, s14 +; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v7, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v9, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v10, s9, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s9, v3 +; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 +; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s9, v2 +; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 ; GCN-NEXT: v_mov_b32_e32 v8, s3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v6, s12, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s12, v5 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s13, v5 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v8, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s12, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s12, v2 +; GCN-NEXT: v_mul_lo_u32 v6, s13, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s12, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 
v4, v6 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 ; GCN-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s12, v5 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, s9, v0 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 ; GCN-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v1 +; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GCN-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] @@ -6667,30 +6714,30 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v5 -; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v5 -; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1] +; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NEXT: v_subb_u32_e32 v0, vcc, v8, v0, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s11 +; GCN-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] 
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc -; GCN-NEXT: v_xor_b32_e32 v1, s0, v1 -; GCN-NEXT: v_xor_b32_e32 v4, s1, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v1 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN-NEXT: v_xor_b32_e32 v2, s0, v2 +; GCN-NEXT: v_xor_b32_e32 v3, s1, v3 +; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -7010,8 +7057,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @srem_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = srem <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i64_pow2k_denom: @@ -7022,20 +7074,20 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; 
GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s9, s3, 31 -; GCN-NEXT: s_lshr_b32 s9, s9, 20 -; GCN-NEXT: s_add_u32 s9, s2, s9 -; GCN-NEXT: s_addc_u32 s10, s3, 0 -; GCN-NEXT: s_and_b32 s9, s9, s8 -; GCN-NEXT: s_sub_u32 s2, s2, s9 -; GCN-NEXT: s_subb_u32 s3, s3, s10 ; GCN-NEXT: s_ashr_i32 s9, s1, 31 ; GCN-NEXT: s_lshr_b32 s9, s9, 20 ; GCN-NEXT: s_add_u32 s9, s0, s9 ; GCN-NEXT: s_addc_u32 s10, s1, 0 -; GCN-NEXT: s_and_b32 s8, s9, s8 -; GCN-NEXT: s_sub_u32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s9, s9, s8 +; GCN-NEXT: s_sub_u32 s0, s0, s9 ; GCN-NEXT: s_subb_u32 s1, s1, s10 +; GCN-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-NEXT: s_lshr_b32 s9, s9, 20 +; GCN-NEXT: s_add_u32 s9, s2, s9 +; GCN-NEXT: s_addc_u32 s10, s3, 0 +; GCN-NEXT: s_and_b32 s8, s9, s8 +; GCN-NEXT: s_sub_u32 s2, s2, s8 +; GCN-NEXT: s_subb_u32 s3, s3, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -7050,8 +7102,15 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = srem <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 
1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i64_pow2_shl_denom: @@ -7062,8 +7121,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 s18, 0x4f800000 ; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-NEXT: s_add_u32 s2, s2, s4 ; GCN-NEXT: s_mov_b32 s5, s4 @@ -7086,8 +7145,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s12, s11, 31 -; GCN-NEXT: s_add_u32 s0, s10, s12 +; GCN-NEXT: s_ashr_i32 s12, s9, 31 +; GCN-NEXT: s_add_u32 s0, s8, s12 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 ; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 @@ -7104,8 +7163,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: s_addc_u32 s1, s11, s12 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] +; GCN-NEXT: s_addc_u32 s1, s9, s12 +; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -7140,15 +7199,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v7, s11, 
v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 +; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc @@ -7161,9 +7220,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_mul_lo_u32 v0, s16, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 ; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] @@ -7178,14 +7237,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_ashr_i32 s2, s15, 31 ; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GCN-NEXT: s_add_u32 s10, s14, s2 +; GCN-NEXT: s_add_u32 s8, s14, s2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s11, s15, s2 -; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] -; GCN-NEXT: v_cvt_f32_u32_e32 v7, s10 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, s11 +; GCN-NEXT: s_addc_u32 s9, s15, s2 +; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] +; GCN-NEXT: v_cvt_f32_u32_e32 v7, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v9, s9 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; 
GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc @@ -7204,13 +7263,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: s_sub_u32 s2, 0, s10 +; GCN-NEXT: s_sub_u32 s2, 0, s8 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_mul_hi_u32 v2, s2, v3 ; GCN-NEXT: v_mul_lo_u32 v7, s2, v5 -; GCN-NEXT: s_subb_u32 s3, 0, s11 +; GCN-NEXT: s_subb_u32 s3, 0, s9 ; GCN-NEXT: v_mul_lo_u32 v8, s3, v3 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 +; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GCN-NEXT: v_mul_lo_u32 v7, s2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 @@ -7255,68 +7314,68 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_add_u32 s0, s10, s14 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: s_addc_u32 s1, s9, s14 -; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[14:15] +; GCN-NEXT: s_addc_u32 s1, s11, s14 +; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v7, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v9, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v10, s9, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s9, v3 +; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 +; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s9, v2 +; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 ; GCN-NEXT: 
v_mov_b32_e32 v8, s12 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s10, v5 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s12, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s11, v5 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v8, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s10, v5 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s9, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v3, s8, v3 +; GCN-NEXT: v_mul_hi_u32 v4, s8, v2 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v2 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s10, v1 +; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 ; GCN-NEXT: v_subb_u32_e64 v5, s[2:3], v4, v5, s[0:1] ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v8, s[0:1], s10, v6 +; GCN-NEXT: 
v_subrev_i32_e64 v8, s[0:1], s8, v6 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_subb_u32_e32 v0, vcc, v5, v0, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s11, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v1, s14, v1 -; GCN-NEXT: v_xor_b32_e32 v4, s14, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s14 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s14, v1 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v2, s14, v2 +; GCN-NEXT: v_xor_b32_e32 v3, s14, v3 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 473dc6050930d..74fe04bcf3473 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -3,6 +3,9 @@ ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI declare i16 @llvm.bswap.i16(i16) nounwind readnone +declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) 
nounwind readnone +declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) nounwind readnone +declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) nounwind readnone declare i32 @llvm.bswap.i32(i32) nounwind readnone declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone @@ -10,6 +13,7 @@ declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone declare i64 @llvm.bswap.i64(i64) nounwind readnone declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone +declare i48 @llvm.bswap.i48(i48) #1 define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; SI-LABEL: test_bswap_i32: @@ -370,9 +374,9 @@ define float @missing_truncate_promote_bswap(i32 %arg) { ; VI-LABEL: missing_truncate_promote_bswap: ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, 0x10203 +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 ; VI-NEXT: v_perm_b32 v0, 0, v0, s4 -; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] bb: %tmp = trunc i32 %arg to i16 @@ -381,3 +385,197 @@ bb: %tmp3 = fpext half %tmp2 to float ret float %tmp3 } + +define i16 @v_bswap_i16(i16 %src) { +; SI-LABEL: v_bswap_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + ret i16 %bswap +} + +define i32 
@v_bswap_i16_zext_to_i32(i16 %src) { +; SI-LABEL: v_bswap_i16_zext_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i16_zext_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = zext i16 %bswap to i32 + ret i32 %zext +} + +define i32 @v_bswap_i16_sext_to_i32(i16 %src) { +; SI-LABEL: v_bswap_i16_sext_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i16_sext_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = sext i16 %bswap to i32 + ret i32 %zext +} + +define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { +; SI-LABEL: v_bswap_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 
16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x2030001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) + ret <2 x i16> %bswap +} + +define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { +; SI-LABEL: v_bswap_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v4, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_v3i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x2030001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: v_perm_b32 v1, 0, v1, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src) + ret <3 x i16> %bswap +} + +define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) { +; SI-LABEL: v_bswap_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v3, v3, 8 +; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: s_mov_b32 s5, 0xffff0000 +; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_alignbit_b32 v6, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_alignbit_b32 v7, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; 
SI-NEXT: v_bfi_b32 v3, s4, v3, v4 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v7 +; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x2030001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: v_perm_b32 v1, 0, v1, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src) + ret <4 x i16> %bswap +} + +define i64 @v_bswap_i48(i64 %src) { +; SI-LABEL: v_bswap_i48: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_bfi_b32 v2, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v1, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i48: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x10203 +; VI-NEXT: v_perm_b32 v2, 0, v0, s4 +; VI-NEXT: v_perm_b32 v0, 0, v1, s4 +; VI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_setpc_b64 s[30:31] + %trunc = trunc i64 %src to i48 + %bswap = call i48 @llvm.bswap.i48(i48 %trunc) + %zext = zext i48 %bswap to i64 + ret i64 %zext +} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index cd6cecaa4ad7b..29f73d6b37b81 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: @@ -139,6 +140,111 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v5, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s8, s13, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s9, s8 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s10, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s11, s1, s2 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] +; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 +; GCN-IR-NEXT: s_sub_u32 s6, s0, s8 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s11 +; GCN-IR-NEXT: s_subb_u32 s7, s1, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s15, s6 +; GCN-IR-NEXT: s_add_i32 s14, s14, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: s_add_i32 s15, s15, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s16, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s15 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[12:13], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[10:11], v0 +; GCN-IR-NEXT: s_add_u32 s10, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s11 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v8 +; 
GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[8:9], s[2:3] +; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = sdiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -272,6 +378,105 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v8, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v5, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v5 +; GCN-IR-NEXT: 
v_subb_u32_e32 v1, vcc, v3, v5, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[12:13] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12 +; GCN-IR-NEXT: v_ffbh_u32_e32 v9, v13 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v2, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[8:9] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v13, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v12, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 63, v8 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9] +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], v10 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[12:13], v2 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: 
; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v8 +; GCN-IR-NEXT: v_or_b32_e32 v11, v15, v11 +; GCN-IR-NEXT: v_or_b32_e32 v10, v14, v10 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v12, v16 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v17, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v16, vcc, v16, v14 +; GCN-IR-NEXT: v_subb_u32_e32 v17, vcc, v17, v15, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[10:11], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v8, v0 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %x, %y ret i64 %result } @@ -289,23 +494,53 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 ; GCN-NEXT: s_xor_b32 s5, 
s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s4, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: 
v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -319,31 +554,36 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3 -; GCN-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GCN-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -354,47 +594,56 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s7, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_add_i32 s0, s7, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: 
v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: s_xor_b32 s0, s4, s3 -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s4, s7, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv32_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s4, s7, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; 
GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 %result = sdiv i64 %1, %2 @@ -406,49 +655,62 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 33 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s1, s2, s3 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s1, s3 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 
vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: s_xor_b32 s0, s4, s3 -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv31_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: 
v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = sdiv i64 %1, %2 @@ -469,23 +731,53 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 41 ; GCN-NEXT: s_xor_b32 s5, s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s4, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv23_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 +; GCN-IR-NEXT: s_ashr_i64 
s[6:7], s[8:9], 41 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 %result = sdiv i64 %1, %2 @@ -497,49 +789,62 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 39 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s1, s2, s3 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s1, s3 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, 
s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: s_xor_b32 s0, s4, s3 -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv25_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; 
GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 39 %2 = ashr i64 %y, 39 %result = sdiv i64 %1, %2 @@ -556,10 +861,10 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[8:9], 40 -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], 40 -; GCN-NEXT: s_ashr_i64 s[8:9], s[12:13], 40 -; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-NEXT: s_ashr_i64 s[2:3], s[8:9], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[14:15], 40 +; GCN-NEXT: s_ashr_i64 s[10:11], s[12:13], 40 ; GCN-NEXT: s_xor_b32 s1, s2, s10 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s10 @@ -588,12 +893,59 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_bfe_i32 v2, v0, 0, 24 -; GCN-NEXT: v_bfe_i32 v0, v1, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-NEXT: v_bfe_i32 v2, v1, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; 
GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_v2i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[8:9], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[14:15], 40 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[12:13], 40 +; GCN-IR-NEXT: s_xor_b32 s1, s2, s10 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s10 +; GCN-IR-NEXT: s_xor_b32 s2, s0, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s1, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-IR-NEXT: s_ashr_i32 s1, s2, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v4, v0, v4 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v2, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v6, s0 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, s1 +; GCN-IR-NEXT: v_mad_f32 v0, -v4, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mad_f32 v2, -v5, v3, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_bfe_i32 v2, v1, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-IR-NEXT: buffer_store_dwordx4 
v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, %result = sdiv <2 x i64> %1, %2 @@ -636,6 +988,119 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_48: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_sext_i32_i16 s9, s6 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s9, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 24 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s10, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s11, s1, s2 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 +; GCN-IR-NEXT: s_sub_u32 s8, s0, s6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s11 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s15, s8 +; GCN-IR-NEXT: s_add_i32 s14, s14, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: s_add_i32 s15, s15, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s16, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s15 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, 
vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[12:13], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_cbranch_vccz BB9_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB9_7 +; GCN-IR-NEXT: BB9_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB9_6 +; GCN-IR-NEXT: BB9_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[10:11], v0 +; GCN-IR-NEXT: s_add_u32 s10, s8, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB9_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s11 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, 
s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s8, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s9, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_5 +; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB9_7: ; %udiv-end +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3] +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 %result = sdiv i48 %1, %2 @@ -769,6 +1234,93 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s8, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; 
GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB10_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB10_7 +; GCN-IR-NEXT: BB10_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB10_6 +; GCN-IR-NEXT: BB10_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s7, s8, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s10, s9, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB10_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; 
GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s7, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s8, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s9, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_5 +; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB10_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = sdiv i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -892,6 +1444,91 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffc5 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v5, v4, 
vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[14:15], 24, v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: BB11_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v13, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v12, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v11, v15, s[4:5] 
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v15, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB11_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB11_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v0 +; GCN-IR-NEXT: BB11_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 24, %x ret i64 %result } @@ -1015,6 +1652,96 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s10 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v1 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-IR-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, s4, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], s[10:11], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[14:15], s[4:5], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: BB12_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8 +; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6 +; 
GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v10, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v11, v15, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v15, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB12_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB12_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v0 +; GCN-IR-NEXT: BB12_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 32768, %x ret i64 %result } @@ -1029,6 +1756,89 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v8 +; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v9 +; 
GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v9, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v8, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[8:9], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[8:9], v0 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB13_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; 
GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v10 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v12, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB13_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB13_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v1, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-IR-NEXT: BB13_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %x, 32768 ret i64 %result } @@ -1044,22 +1854,49 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-NEXT: s_ashr_i32 s1, s0, 30 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s1, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 
31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s0, s1, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -1077,21 +1914,47 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-NEXT: s_ashr_i32 s1, s0, 30 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s1, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; 
GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s0, s1, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -1104,20 +1967,40 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = sdiv i64 24, %x.shr ret i64 %result @@ -1129,20 +2012,40 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = sdiv i64 32768, %x.shr ret i64 %result @@ -1158,6 +2061,25 @@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, s4, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; 
GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = sdiv i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 9113f6c2e6385..73da5d42e15bb 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: @@ -122,6 +123,106 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s11, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: 
v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s8, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s9 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 
v2, s[0:1], s8, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: s_mov_b32 s8, s4 +; GCN-IR-NEXT: s_mov_b32 s9, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -253,6 +354,110 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, 
v6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v9, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-IR-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[8:9] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 63, v8 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[8:9] +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[0:1], v6 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 
v13, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v8 +; GCN-IR-NEXT: v_or_b32_e32 v11, v15, v11 +; GCN-IR-NEXT: v_or_b32_e32 v10, v14, v10 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v12, v16 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v17, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v16, vcc, v16, v14 +; GCN-IR-NEXT: v_subb_u32_e32 v17, vcc, v17, v15, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[10:11], 1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v7 +; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v8, v7 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 +; 
GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, %y ret i64 %result } @@ -270,25 +475,57 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 41 ; GCN-NEXT: s_xor_b32 s5, s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem23_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 41 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: 
v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 %result = srem i64 %1, %2 @@ -309,25 +546,57 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 ; GCN-NEXT: s_xor_b32 s5, s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, 
v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 @@ -350,15 +619,39 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 ; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: 
s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashr_i64 v[1:2], v[2:3], 40 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v0, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v4, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v3, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mad_f32 v3, -v5, v4, v3 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 @@ -369,48 +662,66 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 39 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s1, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s2, s2, s1 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s1 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem25_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 39 %2 = ashr i64 %y, 39 %result = srem i64 %1, %2 @@ -422,48 +733,66 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 33 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s1, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s2, s2, s1 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s1 -; 
GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem31_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = srem i64 %1, %2 @@ -476,46 +805,60 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i32 s0, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s7, 31 -; GCN-NEXT: s_add_i32 s2, 
s2, s0 -; GCN-NEXT: s_add_i32 s1, s7, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s0 -; GCN-NEXT: s_xor_b32 s2, s1, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s4, s7, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem32_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s4, s7, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 %result = srem i64 %1, %2 @@ -662,6 +1005,121 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem33_64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[6:7], 31 +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s9, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], 
s[6:7] +; GCN-IR-NEXT: s_sub_u32 s8, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s8 +; GCN-IR-NEXT: s_sub_u32 s10, s10, s6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s9 +; GCN-IR-NEXT: s_subb_u32 s11, s11, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s10 +; GCN-IR-NEXT: s_add_i32 s14, s7, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 +; GCN-IR-NEXT: s_add_i32 s12, s13, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s12 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_vccz BB8_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB8_7 +; GCN-IR-NEXT: BB8_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB8_6 +; GCN-IR-NEXT: 
BB8_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[8:9], v0 +; GCN-IR-NEXT: s_add_u32 s6, s10, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s7, s11, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB8_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s6, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s10, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s11, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_5 +; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB8_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s10, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s10, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s11, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s10, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v5, s3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GCN-IR-NEXT: 
v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 31 %2 = ashr i64 %y, 31 %result = srem i64 %1, %2 @@ -706,6 +1164,127 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_48: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 24 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s8, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s8 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s9 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s6 +; GCN-IR-NEXT: s_add_i32 s14, s11, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: s_add_i32 s12, s13, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s12 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[10:11], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_cbranch_vccz BB9_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB9_7 +; GCN-IR-NEXT: BB9_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB9_6 +; GCN-IR-NEXT: BB9_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[8:9], v0 +; GCN-IR-NEXT: s_add_u32 s10, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB9_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s11 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; 
GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_5 +; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB9_7: ; %udiv-end +; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v5, s3 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 %result = srem i48 %1, %2 @@ -833,6 +1412,96 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 +; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[6:7], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB10_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB10_7 +; GCN-IR-NEXT: BB10_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB10_6 +; GCN-IR-NEXT: BB10_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s7, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s8, s3, -1 +; 
GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB10_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s8 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s7, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_5 +; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB10_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = srem i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -951,6 +1620,94 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, 
v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffc5 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], 24, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v2 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: 
v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB11_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v10, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB11_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB11_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: BB11_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 24, %x ret i64 %result } @@ -1069,6 +1826,99 @@ define i64 
@v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s10 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_5 +; 
GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB12_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB12_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB12_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: BB12_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: 
v_add_i32_e32 v2, vcc, v4, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_add_i32_e64 v1, s[4:5], v2, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 32768, %x ret i64 %result } @@ -1085,6 +1935,92 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[4:5], 32, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v4 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 +; 
GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB13_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v11, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v10, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v12 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v13, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v14, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB13_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB13_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: BB13_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 15 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; 
GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, 32768 ret i64 %result } @@ -1100,17 +2036,17 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 @@ -1118,6 +2054,35 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 
s1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -1135,10 +2100,10 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 30 -; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 @@ -1153,6 +2118,35 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 
s1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: s_movk_i32 s1, 0x5b7f +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -1165,22 +2159,44 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, s4, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v4, -v3, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; 
GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, s4, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = srem i64 24, %x.shr ret i64 %result @@ -1192,22 +2208,44 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, s4, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v4, -v3, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] 
+; +; GCN-IR-LABEL: v_test_srem24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, s4, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = srem i64 32768, %x.shr ret i64 %result @@ -1225,6 +2263,27 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, s4, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, s4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
+; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = srem i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index c4795f1769dec..375fdc6163aa0 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: @@ -123,6 +124,95 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s11, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 
0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s6, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s6, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; 
GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -241,6 +331,89 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v11, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v5 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[6:7] +; GCN-IR-NEXT: 
s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v4 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v11, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v10, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 
v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v4 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, %y ret i64 %result } @@ -249,40 +422,52 @@ define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 8 -; GCN-NEXT: s_lshr_b32 s2, s2, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v1 -; GCN-NEXT: v_sub_i32_e32 v1, 
vcc, s3, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-NEXT: s_lshr_b32 s5, s8, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = udiv i64 %1, %2 @@ -296,31 +481,36 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3 -; GCN-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GCN-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: 
v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = udiv i64 %1, %2 @@ -331,38 +521,46 @@ define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s7, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 
v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv32_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 32 %2 = lshr i64 %y, 32 %result = udiv i64 %1, %2 @@ -374,40 +572,52 @@ define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 1 -; GCN-NEXT: s_lshr_b32 s2, s2, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-NEXT: s_lshr_b32 s5, s8, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv31_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: 
v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = udiv i64 %1, %2 @@ -422,24 +632,49 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_lshr_b32 s4, s7, 9 ; GCN-NEXT: s_lshr_b32 s5, s8, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv23_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; 
GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = udiv i64 %1, %2 @@ -570,6 +805,107 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_i48: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s8, 0xffff +; GCN-IR-NEXT: s_mov_b32 s9, 0xff000000 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_and_b32 s1, s3, s8 +; GCN-IR-NEXT: s_and_b32 s0, s2, s9 +; GCN-IR-NEXT: s_and_b32 s3, s7, s8 +; GCN-IR-NEXT: s_and_b32 s2, s6, s9 +; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 +; 
GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: s_add_i32 s9, s12, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, s9 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB7_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB7_7 +; GCN-IR-NEXT: BB7_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB7_6 +; GCN-IR-NEXT: BB7_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s6, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB7_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; 
GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s6, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_5 +; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB7_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 %result = udiv i48 %1, %2 @@ -689,6 +1025,84 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 
+; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[2:3], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB8_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB8_7 +; GCN-IR-NEXT: BB8_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB8_6 +; GCN-IR-NEXT: BB8_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s3, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB8_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s8 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: 
v_sub_i32_e64 v2, s[0:1], s3, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_5 +; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB8_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, s2 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = udiv i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -798,6 +1212,88 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s10 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v2, 
0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB9_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; 
GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB9_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB9_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v6, v0 +; GCN-IR-NEXT: BB9_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result } @@ -809,6 +1305,81 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 15, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB10_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 
0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB10_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB10_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v1, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v0, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s12, v8 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v10, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v0, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB10_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB10_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 +; 
GCN-IR-NEXT: BB10_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, 32768 ret i64 %result } @@ -921,6 +1492,81 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s7 +; GCN-IR-NEXT: s_add_i32 s3, s0, 32 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 59, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[2:3], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_cbranch_vccz BB11_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB11_7 +; GCN-IR-NEXT: BB11_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB11_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; 
GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB11_6 +; GCN-IR-NEXT: BB11_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB11_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], 23, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 24, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v6 +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB11_5 +; GCN-IR-NEXT: BB11_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB11_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, 24 store i64 %result, i64 addrspace(1)* %out ret void @@ -1027,6 +1673,79 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 59, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: BB12_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v1, v7 +; GCN-IR-NEXT: v_or_b32_e32 
v6, v0, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 23, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v0, s[4:5], 0, v9, s[4:5] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 24, v0 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v0 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB12_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB12_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 +; GCN-IR-NEXT: BB12_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, 24 ret i64 %result } @@ -1038,35 +1757,45 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s2, s3, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; 
GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_gt_u32_e64 s[0:1], 25, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, 24, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = udiv i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -1080,37 +1809,43 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 
addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_movk_i32 s2, 0x5b7f -; GCN-NEXT: s_movk_i32 s8, 0x5b7e -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s3, s3, 8 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v1 -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = udiv i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -1122,31 +1857,35 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GCN-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s4 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v1 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 25, v2 -; GCN-NEXT: v_sub_i32_e64 v2, s[4:5], 24, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], 
vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 24, %x.shr ret i64 %result @@ -1157,32 +1896,35 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; GCN-NEXT: s_mov_b32 s6, 0x8001 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GCN-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s4 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v1, 17, v1 -; GCN-NEXT: v_mul_u32_u24_e32 v2, v1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v1 -; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s6, v2 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GCN-NEXT: s_and_b64 vcc, vcc, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[4:5] +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 32768, %x.shr ret i64 %result @@ -1195,6 +1937,22 @@ define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 23, v1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38000000, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 +; 
GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 220f6ad57ddef..845d862eb0db5 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem_i64: @@ -122,6 +123,106 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s11, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 
+; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s8, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s9 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s8, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: s_mov_b32 s8, s4 +; GCN-IR-NEXT: s_mov_b32 s9, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -239,6 +340,95 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 
+; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v5 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; 
GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v13, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v12, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v11, v15, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v15, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v5 +; GCN-IR-NEXT: v_mul_hi_u32 v6, v2, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v6, v5 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %x, %y ret i64 %result } @@ -247,40 +437,56 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: 
s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 1 -; GCN-NEXT: s_lshr_b32 s4, s2, 1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-NEXT: s_lshr_b32 s5, s8, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem31_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = urem i64 %1, %2 @@ -291,66 +497,87 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_brev_b32 s0, -2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s13, 1 -; GCN-NEXT: s_lshr_b32 s6, s15, 1 -; GCN-NEXT: s_lshr_b32 s12, s5, 1 -; GCN-NEXT: s_lshr_b32 s5, s7, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: 
v_cvt_f32_u32_e32 v2, s12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, s12 -; GCN-NEXT: v_mul_lo_u32 v6, v2, s12 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v7, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v3, v3, v0 -; GCN-NEXT: v_mul_hi_u32 v4, v4, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s6 -; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s5, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v3 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s12, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], s4, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v4 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GCN-NEXT: s_and_b64 vcc, s[6:7], s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v6, v3, s[4:5] +; GCN-NEXT: s_lshr_b32 s1, s7, 1 +; GCN-NEXT: s_lshr_b32 s2, s5, 1 +; 
GCN-NEXT: s_lshr_b32 s3, s11, 1 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem31_v2i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_brev_b32 s0, -2 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s1, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s3, s11, 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; 
GCN-IR-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-IR-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-IR-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, %result = urem <2 x i64> %1, %2 @@ -362,40 +589,56 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 8 -; GCN-NEXT: s_lshr_b32 s4, s2, 8 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-NEXT: s_lshr_b32 s5, s8, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem24_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 
vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = urem i64 %1, %2 @@ -406,55 +649,87 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_brev_b32 s0, -2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s11, 9 -; GCN-NEXT: s_lshr_b32 s3, s9, 1 -; GCN-NEXT: s_lshr_b32 s8, s15, 9 -; GCN-NEXT: s_lshr_b32 s9, s13, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v0, s9 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s9 -; GCN-NEXT: v_mad_f32 v2, -v4, v3, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v6 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; 
GCN-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v2, v2, s8 -; GCN-NEXT: v_mul_hi_u32 v3, v3, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v3, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s9 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s9, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v3 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s9, v3 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-NEXT: s_lshr_b32 s1, s7, 9 +; GCN-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-NEXT: s_lshr_b32 s3, s11, 9 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 
v[0:3], off, s[12:15], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem23_64_v2i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_brev_b32 s0, -2 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s1, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s3, s11, 9 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-IR-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-IR-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, %result = urem <2 x i64> %1, %2 @@ -573,6 +848,93 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] ; GCN-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[2:3], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB6_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB6_7 +; GCN-IR-NEXT: BB6_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB6_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB6_6 +; GCN-IR-NEXT: BB6_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s3, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB6_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 
v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s8 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s3, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB6_5 +; GCN-IR-NEXT: BB6_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB6_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -684,6 +1046,90 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; 
GCN-IR-LABEL: s_test_urem_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s7 +; GCN-IR-NEXT: s_add_i32 s3, s0, 32 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 59, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[2:3], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_cbranch_vccz BB7_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB7_7 +; GCN-IR-NEXT: BB7_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB7_6 +; GCN-IR-NEXT: BB7_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB7_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 
v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], 23, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 24, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v6 +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_5 +; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB7_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, 24 +; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, 24 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s7 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, 24 store i64 %result, i64 addrspace(1)* %out ret void @@ -793,6 +1239,94 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: 
v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s10 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB8_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB8_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB8_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: 
v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB8_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB8_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: BB8_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_add_i32_e64 v1, s[4:5], v2, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = urem i64 32768, %x ret i64 %result } @@ -804,6 +1338,82 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 
vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB9_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v7 +; 
GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v10 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v12, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB9_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB9_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: BB9_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 15 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %x, 32768 ret i64 %result } @@ -815,35 +1425,49 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s8, s3, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s8 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; 
GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v1 -; GCN-NEXT: v_cmp_gt_u32_e64 s[0:1], 25, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 
+; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = urem i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -856,38 +1480,50 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_movk_i32 s2, 0x5b7f -; GCN-NEXT: s_movk_i32 s8, 0x5b7e -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s3, s3, 8 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s8, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0xffffa481, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: 
v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_movk_i32 s2, 0x5b7f +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = urem i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -899,31 +1535,39 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v1, v1, 
24 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v2, v0 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 25, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; GCN-NEXT: v_sub_i32_e64 v0, s[6:7], v2, v0 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = urem i64 24, %x.shr ret i64 %result @@ -934,32 +1578,39 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; GCN-NEXT: s_mov_b32 s6, 0x8001 +; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; 
GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v1, 17, v1 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v2, v0 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; GCN-NEXT: v_sub_i32_e64 v0, s[6:7], v2, v0 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-IR-NEXT: 
v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = urem i64 32768, %x.shr ret i64 %result @@ -972,6 +1623,24 @@ define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_bfe_u32 v0, v1, 8, 15 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v1, -v2, s4, v1 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = urem i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/Hexagon/swp-sigma.ll b/llvm/test/CodeGen/Hexagon/swp-sigma.ll index 3ab88b8d84631..1651742820998 100644 --- a/llvm/test/CodeGen/Hexagon/swp-sigma.ll +++ b/llvm/test/CodeGen/Hexagon/swp-sigma.ll @@ -2,28 +2,11 @@ ; We do not pipeline sigma yet, but the non-pipelined version ; with good scheduling is pretty fast. The compiler generates -; 19 packets, and the assembly version is 16. +; 18 packets, and the assembly version is 16. 
; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } +; CHECK-COUNT-17: } ; CHECK: }{{[ \t]*}}:endloop @g0 = external constant [10 x i16], align 128 diff --git a/llvm/test/CodeGen/Hexagon/vect-regpairs.ll b/llvm/test/CodeGen/Hexagon/vect-regpairs.ll new file mode 100644 index 0000000000000..4d505fc2f4eef --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect-regpairs.ll @@ -0,0 +1,134 @@ +;RUN: llc -march=hexagon -mcpu=hexagonv66 -mhvx -filetype=obj < %s -o - | llvm-objdump -mv66 -mhvx -d - | FileCheck --check-prefix=CHECK-V66 %s +;RUN: llc -march=hexagon -mcpu=hexagonv67 -mhvx -filetype=obj < %s -o - | llvm-objdump -mv67 -mhvx -d - | FileCheck --check-prefix=CHECK-V67 %s + +; Should not attempt to use v: 'reverse' vector regpairs +; on old or new arches (should not crash). 
+ +; CHECK-V66: vcombine +; CHECK-V67: vcombine +declare <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.hexagon.V6.vd0() +declare <32 x i32> @llvm.hexagon.V6.vmpybus(<16 x i32>, i32) +declare <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32>, <32 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) +declare <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32>, <16 x i32>, i32 ) +declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) +declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32 ) +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.hexagon.V6.vmpyihb.acc(<16 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vasrhubrndsat(<16 x i32>, <16 x i32>, i32) + +declare <32 x i32> @llvm.hexagon.V6.vaddubh(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.hexagon.V6.vmpybus.acc(<32 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vmpyiwb.acc(<16 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vshuffob(<16 x i32>, <16 x i32>) + + +define void @Gaussian7x7u8PerRow(i8* %src, i32 %stride, i32 %width, i8* %dst) #0 { +entry: + %mul = mul i32 %stride, 3 + %idx.neg = sub i32 0, %mul + %add.ptr = getelementptr i8, i8* %src, i32 %idx.neg + bitcast i8* %add.ptr to <16 x i32>* + %mul1 = shl i32 %stride, 1 + %idx.neg2 = sub i32 0, %mul1 + %add.ptr3 = getelementptr i8, i8* %src, i32 %idx.neg2 + bitcast i8* %add.ptr3 to <16 x i32>* + %idx.neg5 = sub i32 0, %stride + %add.ptr6 = getelementptr i8, i8* %src, i32 %idx.neg5 + bitcast i8* %add.ptr6 to <16 x i32>* + bitcast i8* %src to <16 x i32>* + %add.ptr10 = getelementptr i8, i8* %src, i32 %stride + bitcast i8* %add.ptr10 to <16 x i32>* + %add.ptr12 = getelementptr i8, i8* %src, i32 %mul1 + bitcast i8* %add.ptr12 to <16 x i32>* + %add.ptr14 = getelementptr i8, i8* %src, i32 %mul + bitcast i8* %add.ptr14 to <16 x i32>* + bitcast i8* %dst to <16 x i32>* + load <16 x i32>, <16 x i32>* %0load <16 x 
i32>, <16 x i32>* %1load <16 x i32>, <16 x i32>* %2load <16 x i32>, <16 x i32>* %3load <16 x i32>, <16 x i32>* %4load <16 x i32>, <16 x i32>* %5load <16 x i32>, <16 x i32>* %6call <16 x i32> @llvm.hexagon.V6.vd0() + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %15, <16 x i32> %15) + call <32 x i32> @llvm.hexagon.V6.vaddubh(<16 x i32> %14, <16 x i32> %8) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %13, <16 x i32> %9) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %17, <32 x i32> %18, i32 101058054) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %12, <16 x i32> %10) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %19, <32 x i32> %20, i32 252645135) + call <32 x i32> @llvm.hexagon.V6.vmpybus.acc(<32 x i32> %21, <16 x i32> %11, i32 336860180) + %cmp155 = icmp sgt i32 %width, 64 + br i1 %cmp155, label %for.body.preheader, label %for.end +for.body.preheader: %incdec.ptr20 = getelementptr i8, i8* %add.ptr14%23 = bitcast i8* %incdec.ptr20 to <16 x i32>* + %incdec.ptr19 = getelementptr i8, i8* %add.ptr12%24 = bitcast i8* %incdec.ptr19 to <16 x i32>* + %incdec.ptr18 = getelementptr i8, i8* %add.ptr10%25 = bitcast i8* %incdec.ptr18 to <16 x i32>* + %incdec.ptr17 = getelementptr i8, i8* %src%26 = bitcast i8* %incdec.ptr17 to <16 x i32>* + %incdec.ptr16 = getelementptr i8, i8* %add.ptr6%27 = bitcast i8* %incdec.ptr16 to <16 x i32>* + %incdec.ptr15 = getelementptr i8, i8* %add.ptr3%28 = bitcast i8* %incdec.ptr15 to <16 x i32>* + %incdec.ptr = getelementptr i8, i8* %add.ptr%29 = bitcast i8* %incdec.ptr to <16 x i32>* + br label %for.body +for.body: %optr.0166 = phi <16 x i32>* [ %incdec.ptr28, %for.body ], [ %7, %for.body.preheader ] + %iptr6.0165 = phi <16 x i32>* [ %incdec.ptr27, %for.body ], [ %23, %for.body.preheader ] + %iptr5.0164 = phi <16 x i32>* [ %incdec.ptr26, %for.body ], [ %24, %for.body.preheader ] + %iptr4.0163 = phi <16 x i32>* [ %incdec.ptr25, %for.body ], [ %25, %for.body.preheader ] + %iptr3.0162 = phi <16 x i32>* 
[ %incdec.ptr24, %for.body ], [ %26, %for.body.preheader ] + %iptr2.0161 = phi <16 x i32>* [ %incdec.ptr23, %for.body ], [ %27, %for.body.preheader ] + %iptr1.0160 = phi <16 x i32>* [ %incdec.ptr22, %for.body ], [ %28, %for.body.preheader ] + %iptr0.0159 = phi <16 x i32>* [ %incdec.ptr21, %for.body ], [ %29, %for.body.preheader ] + %dXV1.0158 = phi <32 x i32> [ %49, %for.body ], [ %22, %for.body.preheader ] + %dXV0.0157 = phi <32 x i32> [ %dXV1.0158, %for.body ], [ %16, %for.body.preheader ] + %i.0156 = phi i32 [ %sub, %for.body ], [ %width, %for.body.preheader ] + %incdec.ptr21 = getelementptr <16 x i32>, <16 x i32>* %iptr0.0159%30 = load <16 x i32>, <16 x i32>* %iptr0.0159%incdec.ptr22 = getelementptr <16 x i32>, <16 x i32>* %iptr1.0160%31 = load <16 x i32>, <16 x i32>* %iptr1.0160%incdec.ptr23 = getelementptr <16 x i32>, <16 x i32>* %iptr2.0161%32 = load <16 x i32>, <16 x i32>* %iptr2.0161%incdec.ptr24 = getelementptr <16 x i32>, <16 x i32>* %iptr3.0162%33 = load <16 x i32>, <16 x i32>* %iptr3.0162%incdec.ptr25 = getelementptr <16 x i32>, <16 x i32>* %iptr4.0163%34 = load <16 x i32>, <16 x i32>* %iptr4.0163%incdec.ptr26 = getelementptr <16 x i32>, <16 x i32>* %iptr5.0164%35 = load <16 x i32>, <16 x i32>* %iptr5.0164%incdec.ptr27 = getelementptr <16 x i32>, <16 x i32>* %iptr6.0165%36 = load <16 x i32>, <16 x i32>* %iptr6.0165, !tbaa !8 + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %dXV1.0158) + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %dXV0.0157) + call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %37, <16 x i32> %38, i32 2) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %dXV1.0158) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %dXV0.0157) + call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %40, <16 x i32> %41, i32 2) + call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %37, <16 x i32> %38, i32 4) + call <32 x i32> @llvm.hexagon.V6.vaddubh(<16 x i32> %36, <16 x i32> %30) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %35, <16 x 
i32> %31) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %44, <32 x i32> %45, i32 101058054) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %34, <16 x i32> %32) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %46, <32 x i32> %47, i32 252645135) + call <32 x i32> @llvm.hexagon.V6.vmpybus.acc(<32 x i32> %48, <16 x i32> %33, i32 336860180) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %49) + call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %50, <16 x i32> %40, i32 2) + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %49) + call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %52, <16 x i32> %37, i32 2) + call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %50, <16 x i32> %40, i32 4) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %37, <16 x i32> %39) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %55, <16 x i32> %40) + call <32 x i32> @llvm.hexagon.V6.vmpahb(<32 x i32> %56, i32 252972820) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %51, <16 x i32> %40) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %58, <16 x i32> %37) + call <32 x i32> @llvm.hexagon.V6.vmpahb(<32 x i32> %59, i32 252972820) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %53, <16 x i32> %43) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %51, <16 x i32> %42) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %61, <16 x i32> %62) + call <32 x i32> @llvm.hexagon.V6.vmpahb.acc(<32 x i32> %57, <32 x i32> %63, i32 17170694) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %54, <16 x i32> %42) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %53, <16 x i32> %39) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %65, <16 x i32> %66) + call <32 x i32> @llvm.hexagon.V6.vmpahb.acc(<32 x i32> %60, <32 x i32> %67, i32 17170694) + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %64) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %64) + call <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32> %69, <16 x i32> %70, i32 12) + call <16 x 
i32> @llvm.hexagon.V6.hi(<32 x i32> %68) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %68) + call <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32> %72, <16 x i32> %73, i32 12) + call <16 x i32> @llvm.hexagon.V6.vshuffeb(<16 x i32> %74, <16 x i32> %71) + %incdec.ptr28 = getelementptr <16 x i32>, <16 x i32>* %1 + store <16 x i32> %75, <16 x i32>* %optr.0166%sub = add i32 %i.0156, -64 + %cmp = icmp sgt i32 %sub, 64 + br i1 %cmp, label %for.body, label %for.end +for.end: ret void +} +declare <32 x i32> @llvm.hexagon.V6.vmpahb(<32 x i32>, i32) +declare <32 x i32> @llvm.hexagon.V6.vmpahb.acc(<32 x i32>, <32 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vshuffeb(<16 x i32>, <16 x i32>) + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math""target-cpu"="hexagonv65" "target-features"="+hvx-length64b,+hvxv65,+v65,-long-calls" "unsafe-fp-math"} +!8 = !{!9, !9, i64 0} +!9 = !{!"omnipotent char", !10} +!10 = !{} +!14 = !{} +!19 = !{} +!24 = !{} diff --git a/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll b/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll index acb85fd171f54..43a26c123e78f 100644 --- a/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll +++ b/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll @@ -1,4 +1,5 @@ ; RUN: llc -fast-isel -fast-isel-abort=1 -asm-verbose=false -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s +; RUN: llc -fast-isel -fast-isel-abort=1 -asm-verbose=false -mtriple=x86_64-unknown-unknown -mattr=+avx512vl < %s | FileCheck %s ; Verify that fast-isel correctly expands float-half conversions. 
@@ -14,7 +15,7 @@ entry: define float @test_fp16_to_fp32(i32 %a) { ; CHECK-LABEL: test_fp16_to_fp32: -; CHECK: movswl %di, %eax +; CHECK: movzwl %di, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll index 65befee085c03..0b6fb97ef913d 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -1,57 +1,257 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC - -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC - -declare float @llvm.sqrt.f32(float) #0 -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0 -declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0 - -define float @foo_x1(float %f) #0 { -; SCALAR-EST-LABEL: foo_x1: -; SCALAR-EST: # %bb.0: -; SCALAR-EST-NEXT: rsqrtss %xmm0 -; SCALAR-EST: retq -; -; SCALAR-ACC-LABEL: foo_x1: -; SCALAR-ACC: # %bb.0: -; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}} -; SCALAR-ACC-NEXT: retq - %call = tail call float @llvm.sqrt.f32(float %f) #1 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem | FileCheck %s 
--check-prefixes=NHM +; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB +; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s --check-prefixes=FAST-SCALAR,BDW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,SKL + +define float @f32_no_daz(float %f) #0 { +; NHM-LABEL: f32_no_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtss %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NHM-NEXT: mulss %xmm2, %xmm3 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: addss {{.*}}(%rip), %xmm2 +; NHM-NEXT: andps {{.*}}(%rip), %xmm0 +; NHM-NEXT: mulss %xmm3, %xmm2 +; NHM-NEXT: cmpltss {{.*}}(%rip), %xmm0 +; NHM-NEXT: andnps %xmm2, %xmm0 +; NHM-NEXT: retq +; +; FAST-SCALAR-LABEL: f32_no_daz: +; FAST-SCALAR: # %bb.0: +; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; FAST-SCALAR-NEXT: retq + %call = tail call fast float @llvm.sqrt.f32(float %f) #2 + ret float %call +} + +define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 { +; NHM-LABEL: v4f32_no_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtps %xmm0, %xmm2 +; NHM-NEXT: movaps %xmm0, %xmm1 +; NHM-NEXT: mulps %xmm2, %xmm1 +; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: mulps %xmm1, %xmm3 +; NHM-NEXT: mulps %xmm2, %xmm1 +; NHM-NEXT: addps {{.*}}(%rip), %xmm1 +; NHM-NEXT: andps {{.*}}(%rip), %xmm0 +; NHM-NEXT: mulps %xmm3, %xmm1 +; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; NHM-NEXT: cmpleps %xmm0, %xmm2 +; NHM-NEXT: andps %xmm2, %xmm1 +; NHM-NEXT: movaps %xmm1, %xmm0 +; NHM-NEXT: retq +; +; SNB-LABEL: v4f32_no_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %xmm0, %xmm1 +; SNB-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 +; SNB-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 +; SNB-NEXT: vandps {{.*}}(%rip), 
%xmm0, %xmm0 +; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 +; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 +; SNB-NEXT: retq +; +; BDW-LABEL: v4f32_no_daz: +; BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %xmm0, %xmm1 +; BDW-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; BDW-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; BDW-NEXT: vandps %xmm2, %xmm0, %xmm0 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; BDW-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v4f32_no_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %xmm0, %xmm0 +; SKL-NEXT: retq + %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2 + ret <4 x float> %call +} + +define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 { +; NHM-LABEL: v8f32_no_daz: +; NHM: # %bb.0: +; NHM-NEXT: sqrtps %xmm0, %xmm0 +; NHM-NEXT: sqrtps %xmm1, %xmm1 +; NHM-NEXT: retq +; +; SNB-LABEL: v8f32_no_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %ymm0, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm3 +; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 +; SNB-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 +; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 +; SNB-NEXT: retq +; +; BDW-LABEL: v8f32_no_daz: +; 
BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %ymm0, %ymm1 +; BDW-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BDW-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; BDW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; BDW-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 +; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v8f32_no_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %ymm0, %ymm0 +; SKL-NEXT: retq + %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2 + ret <8 x float> %call +} + +; Repeat all tests with denorms-as-zero enabled. 
+ +define float @f32_daz(float %f) #1 { +; NHM-LABEL: f32_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtss %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NHM-NEXT: mulss %xmm2, %xmm3 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: addss {{.*}}(%rip), %xmm2 +; NHM-NEXT: mulss %xmm3, %xmm2 +; NHM-NEXT: xorps %xmm1, %xmm1 +; NHM-NEXT: cmpeqss %xmm1, %xmm0 +; NHM-NEXT: andnps %xmm2, %xmm0 +; NHM-NEXT: retq +; +; FAST-SCALAR-LABEL: f32_daz: +; FAST-SCALAR: # %bb.0: +; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; FAST-SCALAR-NEXT: retq + %call = tail call fast float @llvm.sqrt.f32(float %f) #2 ret float %call } -define <4 x float> @foo_x4(<4 x float> %f) #0 { -; VECTOR-EST-LABEL: foo_x4: -; VECTOR-EST: # %bb.0: -; VECTOR-EST-NEXT: rsqrtps %xmm0 -; VECTOR-EST: retq -; -; VECTOR-ACC-LABEL: foo_x4: -; VECTOR-ACC: # %bb.0: -; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}} -; VECTOR-ACC-NEXT: retq - %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1 +define <4 x float> @v4f32_daz(<4 x float> %f) #1 { +; NHM-LABEL: v4f32_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtps %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulps %xmm1, %xmm2 +; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: mulps %xmm2, %xmm3 +; NHM-NEXT: mulps %xmm1, %xmm2 +; NHM-NEXT: addps {{.*}}(%rip), %xmm2 +; NHM-NEXT: mulps %xmm3, %xmm2 +; NHM-NEXT: xorps %xmm1, %xmm1 +; NHM-NEXT: cmpneqps %xmm1, %xmm0 +; NHM-NEXT: andps %xmm2, %xmm0 +; NHM-NEXT: retq +; +; SNB-LABEL: v4f32_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %xmm0, %xmm1 +; SNB-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 +; SNB-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 +; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 +; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; SNB-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0 +; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 +; SNB-NEXT: retq +; 
+; BDW-LABEL: v4f32_daz: +; BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %xmm0, %xmm1 +; BDW-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; BDW-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; BDW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; BDW-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0 +; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v4f32_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %xmm0, %xmm0 +; SKL-NEXT: retq + %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2 ret <4 x float> %call } -define <8 x float> @foo_x8(<8 x float> %f) #0 { -; VECTOR-EST-LABEL: foo_x8: -; VECTOR-EST: # %bb.0: -; VECTOR-EST-NEXT: rsqrtps -; VECTOR-EST: retq -; -; VECTOR-ACC-LABEL: foo_x8: -; VECTOR-ACC: # %bb.0: -; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}} -; VECTOR-ACC-NOT: rsqrt -; VECTOR-ACC: retq - %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1 +define <8 x float> @v8f32_daz(<8 x float> %f) #1 { +; NHM-LABEL: v8f32_daz: +; NHM: # %bb.0: +; NHM-NEXT: sqrtps %xmm0, %xmm0 +; NHM-NEXT: sqrtps %xmm1, %xmm1 +; NHM-NEXT: retq +; +; SNB-LABEL: v8f32_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %ymm0, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm3 +; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; SNB-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 +; SNB-NEXT: retq +; +; BDW-LABEL: v8f32_daz: +; BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %ymm0, %ymm1 +; BDW-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * 
ymm1) + ymm3 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BDW-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; BDW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; BDW-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v8f32_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %ymm0, %ymm0 +; SKL-NEXT: retq + %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2 ret <8 x float> %call } -attributes #0 = { "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } +declare float @llvm.sqrt.f32(float) #2 +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2 +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #2 + +attributes #0 = { "denormal-fp-math"="ieee,ieee" } +attributes #1 = { "denormal-fp-math"="ieee,preserve-sign" } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/MC/Hexagon/hvx-swapped-regpairs-alias-neg.s b/llvm/test/MC/Hexagon/hvx-swapped-regpairs-alias-neg.s new file mode 100644 index 0000000000000..1988f90dc56e3 --- /dev/null +++ b/llvm/test/MC/Hexagon/hvx-swapped-regpairs-alias-neg.s @@ -0,0 +1,15 @@ +# RUN: not llvm-mc -arch=hexagon -mcpu=hexagonv67 -mhvx -filetype=asm %s 2>%t; FileCheck --implicit-check-not="error:" %s <%t + +{ + v1:0 = #0 + v0:1 = #0 +} +# CHECK: error: register `V1' modified more than once + +## Unused .tmp: +{ + v1.tmp = vmem(r0 + #3) + v0:1 = vaddw(v17:16, v17:16) +} + +# CHECK: warning: register `V1' used with `.tmp' but not used in the same packet diff --git a/llvm/test/MC/Hexagon/hvx-swapped-regpairs.s b/llvm/test/MC/Hexagon/hvx-swapped-regpairs.s new file mode 100644 index 0000000000000..1ddec177e7838 --- /dev/null +++ b/llvm/test/MC/Hexagon/hvx-swapped-regpairs.s @@ -0,0 +1,43 @@ +# RUN: llvm-mc -filetype=obj -arch=hexagon -mcpu=hexagonv67 -mhvx %s | llvm-objdump -d -mcpu=hexagonv67 -mhvx - | FileCheck %s +# RUN: not llvm-mc -arch=hexagon -mcpu=hexagonv65 -mhvx 
-filetype=asm %s 2>%t; FileCheck --check-prefix=CHECK-V65 --implicit-check-not="error:" %s <%t + +v1:0.w = vadd(v0.h, v1.h) // Normal +# CHECK: 1ca1c080 + +v0:1.w = vadd(v0.h, v1.h) // Swapped +# CHECK-NEXT: 1ca1c081 +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +## Swapped use: +v1:0.w = vtmpy(v0:1.h,r0.b) +# CHECK-NEXT: 19a0c180 +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +## Swapped def +v0:1 = v3:2 +# CHECK-NEXT: 1f42c3e1 { v0:1 = vcombine(v3,v2) } +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +# Mapped instruction's swapped use: +v1:0 = v2:3 +# CHECK-NEXT: v1:0 = vcombine(v2,v3) +## No error for v65, this is now permitted! + +## .new producer from pair: +{ + v0:1 = vaddw(v0:1, v0:1) + if (!p0) vmem(r0+#0)=v0.new +} +# CHECK-NEXT: v0:1.w = vadd(v0:1.w,v0:1.w) +# CHECK-NEXT: if (!p0) vmem(r0+#0) = v0.new +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +## Used .tmp, swapped use & def: +{ + v0.tmp = vmem(r0 + #3) + v2:3 = vaddw(v0:1, v0:1) +} +# CHECK-NEXT: 1c6141c3 { v2:3.w = vadd(v0:1.w,v0:1.w) +# CHECK-NEXT: v0.tmp = vmem(r0+#3) } +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture +# CHECK-V65: error: register pair `WR1' is not permitted for this architecture diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index 49144beec3afc..28137f646ace5 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -101,16 +101,17 @@ ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Reassociate expressions ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic 
Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG @@ -198,13 +199,12 @@ ; CHECK-NEXT: Float to int ; CHECK-NEXT: Lower constant intrinsics ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll index cbb90eef5ef7c..61b5b9c973b87 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -106,16 +106,17 @@ ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Reassociate expressions ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG @@ -203,13 +204,12 @@ ; CHECK-NEXT: Float to int ; CHECK-NEXT: 
Lower constant intrinsics ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index ce3801388b1e6..81f82d080c709 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -88,16 +88,17 @@ ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Reassociate expressions ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG @@ -185,13 +186,12 @@ ; CHECK-NEXT: Float to int ; CHECK-NEXT: Lower constant intrinsics ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; 
CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops diff --git a/llvm/test/Other/pass-pipelines.ll b/llvm/test/Other/pass-pipelines.ll index b3887cf6f969c..718ca46e2ed2a 100644 --- a/llvm/test/Other/pass-pipelines.ll +++ b/llvm/test/Other/pass-pipelines.ll @@ -54,6 +54,7 @@ ; CHECK-O2-NEXT: FunctionPass Manager ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Pass Manager +; CHECK-O2: Loop Pass Manager ; CHECK-O2-NOT: Manager ; FIXME: We shouldn't be pulling out to simplify-cfg and instcombine and ; causing new loop pass managers. diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll new file mode 100644 index 0000000000000..bf42d2f5ff646 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s + +; GCN-LABEL: @bswap_v2i16( +; GFX7: call i16 @llvm.bswap.i16( +; GFX7: call i16 @llvm.bswap.i16( + +; GFX8: call <2 x i16> @llvm.bswap.v2i16( +define <2 x i16> @bswap_v2i16(<2 x i16> %arg) { +bb: + %tmp = extractelement <2 x i16> %arg, i64 0 + %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp) + %tmp2 = insertelement <2 x i16> undef, i16 %tmp1, i64 0 + %tmp3 = extractelement <2 x i16> %arg, i64 1 + %tmp4 = tail call i16 @llvm.bswap.i16(i16 %tmp3) + %tmp5 = insertelement <2 x i16> %tmp2, i16 %tmp4, i64 1 + ret <2 x i16> %tmp5 +} + +; GCN-LABEL: @bswap_v2i32( +; GCN: call i32 @llvm.bswap.i32 +; GCN: call i32 @llvm.bswap.i32 +define <2 x i32> @bswap_v2i32(<2 x i32> %arg) { +bb: + %tmp = extractelement <2 x i32> %arg, i64 0 + %tmp1 = tail call i32 @llvm.bswap.i32(i32 
%tmp) + %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i64 0 + %tmp3 = extractelement <2 x i32> %arg, i64 1 + %tmp4 = tail call i32 @llvm.bswap.i32(i32 %tmp3) + %tmp5 = insertelement <2 x i32> %tmp2, i32 %tmp4, i64 1 + ret <2 x i32> %tmp5 +} + +declare i16 @llvm.bswap.i16(i16) #0 +declare i32 @llvm.bswap.i32(i32) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/tools/UpdateTestChecks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/lit.local.cfg index 74164e808e476..d9d11b5a06c07 100644 --- a/llvm/test/tools/UpdateTestChecks/lit.local.cfg +++ b/llvm/test/tools/UpdateTestChecks/lit.local.cfg @@ -42,11 +42,3 @@ if os.path.isfile(llvm_mca_path): config.available_features.add('llvm-mca-binary') mca_arg = '--llvm-mca-binary ' + shell_quote(llvm_mca_path) add_update_script_substition('%update_test_checks', extra_args=mca_arg) - -clang_path = os.path.join(config.llvm_tools_dir, 'clang') -if os.path.isfile(clang_path): - config.available_features.add('clang-binary') - extra_args = '--clang ' + shell_quote(clang_path) - if os.path.isfile(opt_path): - extra_args += ' --opt ' + shell_quote(opt_path) - add_update_script_substition('%update_cc_test_checks', extra_args=extra_args) diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/lit.local.cfg deleted file mode 100644 index 99346daabcb06..0000000000000 --- a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -# These tests require clang. 
-if 'clang-binary' not in config.available_features: - config.unsupported = True diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 6b2608160aea9..54adeaa11c1a2 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -258,7 +258,7 @@ static Expected getOptions(opt::InputArgList &Args) { if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) Options.LinkOpts.Threads = atoi(NumThreads->getValue()); else - Options.LinkOpts.Threads = thread::hardware_concurrency(); + Options.LinkOpts.Threads = 0; // Use all available hardware threads if (Options.DumpDebugMap || Options.LinkOpts.Verbose) Options.LinkOpts.Threads = 1; @@ -541,9 +541,10 @@ int main(int argc, char **argv) { // Shared a single binary holder for all the link steps. BinaryHolder BinHolder; - unsigned ThreadCount = - std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size()); - ThreadPool Threads(ThreadCount); + unsigned ThreadCount = Options.LinkOpts.Threads; + if (!ThreadCount) + ThreadCount = DebugMapPtrsOrErr->size(); + ThreadPool Threads(hardware_concurrency(ThreadCount)); // If there is more than one link to execute, we need to generate // temporary files. diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index f68f7183e034c..2a9398fed9f35 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -134,8 +134,8 @@ namespace options { static unsigned OptLevel = 2; // Default parallelism of 0 used to indicate that user did not specify. // Actual parallelism default value depends on implementation. - // Currently only affects ThinLTO, where the default is - // llvm::heavyweight_hardware_concurrency. + // Currently only affects ThinLTO, where the default is the max cores in the + // system. static unsigned Parallelism = 0; // Default regular LTO codegen parallelism (number of partitions). 
static unsigned ParallelCodeGenParallelismLevel = 1; diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp index 52e9958e92da9..625e2342e4bb3 100644 --- a/llvm/tools/llvm-cov/CodeCoverage.cpp +++ b/llvm/tools/llvm-cov/CodeCoverage.cpp @@ -947,9 +947,7 @@ int CodeCoverageTool::doShow(int argc, const char **argv, // If NumThreads is not specified, auto-detect a good default. if (NumThreads == 0) - NumThreads = - std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(), - unsigned(SourceFiles.size()))); + NumThreads = SourceFiles.size(); if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) { for (const std::string &SourceFile : SourceFiles) @@ -957,7 +955,7 @@ int CodeCoverageTool::doShow(int argc, const char **argv, ShowFilenames); } else { // In -output-dir mode, it's safe to use multiple threads to print files. - ThreadPool Pool(NumThreads); + ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads)); for (const std::string &SourceFile : SourceFiles) Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile, Coverage.get(), Printer.get(), ShowFilenames); diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp index 216b5e3fd2263..ba8ff5c8fe523 100644 --- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp +++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp @@ -163,11 +163,9 @@ json::Array renderFiles(const coverage::CoverageMapping &Coverage, ArrayRef FileReports, const CoverageViewOptions &Options) { auto NumThreads = Options.NumThreads; - if (NumThreads == 0) { - NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(), - unsigned(SourceFiles.size()))); - } - ThreadPool Pool(NumThreads); + if (NumThreads == 0) + NumThreads = SourceFiles.size(); + ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads)); json::Array FileArray; std::mutex FileArrayMutex; diff --git a/llvm/tools/llvm-cov/CoverageReport.cpp 
b/llvm/tools/llvm-cov/CoverageReport.cpp index 82259542c5970..187e2dc4f553a 100644 --- a/llvm/tools/llvm-cov/CoverageReport.cpp +++ b/llvm/tools/llvm-cov/CoverageReport.cpp @@ -356,11 +356,8 @@ std::vector CoverageReport::prepareFileReports( // If NumThreads is not specified, auto-detect a good default. if (NumThreads == 0) - NumThreads = - std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(), - unsigned(Files.size()))); - - ThreadPool Pool(NumThreads); + NumThreads = Files.size(); + ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads)); std::vector FileReports; FileReports.reserve(Files.size()); diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h index d1e20b9b36a83..65a3fe61aecb0 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -162,11 +162,28 @@ class CombinationGenerator { SmallVector, variable_smallsize> VariablesState; + // 'increment' of the the whole VariablesState is defined identically to the + // increment of a number: starting from the least significant element, + // increment it, and if it wrapped, then propagate that carry by also + // incrementing next (more significant) element. + auto IncrementState = + [](MutableArrayRef> VariablesState) + -> bool { + for (WrappingIterator &Variable : + llvm::reverse(VariablesState)) { + bool Wrapped = ++Variable; + if (!Wrapped) + return false; // There you go, next combination is ready. + // We have carry - increment more significant variable next.. + } + return true; // MSB variable wrapped, no more unique combinations. + }; + // Initialize the per-variable state to refer to the possible choices for // that variable. 
VariablesState.reserve(VariablesChoices.size()); - for (ArrayRef VariablesChoices : VariablesChoices) - VariablesState.emplace_back(VariablesChoices); + for (ArrayRef VC : VariablesChoices) + VariablesState.emplace_back(VC); // Temporary buffer to store each combination before performing Callback. SmallVector CurrentCombination; @@ -179,23 +196,9 @@ class CombinationGenerator { // And pass the new combination into callback, as intended. if (/*Abort=*/Callback(CurrentCombination)) return; - - // 'increment' the whole VariablesState, much like you would increment - // a number: starting from the least significant element, increment it, - // and if it wrapped, then propagate that carry by also incrementing next - // (more significant) element. - for (WrappingIterator &VariableState : - llvm::reverse(VariablesState)) { - bool Wrapped = ++VariableState; - if (!Wrapped) - break; - - if (VariablesState.begin() == &VariableState) - return; // The "most significant" variable has wrapped, which means - // that we have produced all the combinations. - - // We have carry - increment more significant variable next.. - } + // And tick the state to next combination, which will be unique. + if (IncrementState(VariablesState)) + return; // All combinations produced. } }; diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 8deedd49e0501..fc86fd969efba 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -65,8 +65,8 @@ static cl::opt "import files for the " "distributed backend case")); -static cl::opt Threads("thinlto-threads", - cl::init(llvm::heavyweight_hardware_concurrency())); +// Default to using all hardware cores in the system. 
+static cl::opt Threads("thinlto-threads", cl::init(0)); static cl::list SymbolResolutions( "r", diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 424edf446d035..f05c7e637cd55 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -307,8 +307,11 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, // If NumThreads is not specified, auto-detect a good default. if (NumThreads == 0) - NumThreads = - std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2)); + NumThreads = std::min(hardware_concurrency().compute_thread_count(), + unsigned((Inputs.size() + 1) / 2)); + // FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails + // the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't + // merged, thus the emitted file ends up with a PF_Unknown kind. // Initialize the writer contexts. SmallVector, 4> Contexts; @@ -320,7 +323,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, for (const auto &Input : Inputs) loadInput(Input, Remapper, Contexts[0].get()); } else { - ThreadPool Pool(NumThreads); + ThreadPool Pool(hardware_concurrency(NumThreads)); // Load the inputs in parallel (N/NumThreads serial steps). 
unsigned Ctx = 0; diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp index 8191ec86e61fa..c3a9bda3c817a 100644 --- a/llvm/unittests/ADT/APIntTest.cpp +++ b/llvm/unittests/ADT/APIntTest.cpp @@ -1815,6 +1815,14 @@ TEST(APIntTest, SelfMoveAssignment) { #endif #endif // _MSC_VER +TEST(APIntTest, byteSwap) { + EXPECT_EQ(0x00000000, APInt(16, 0x0000).byteSwap()); + EXPECT_EQ(0x0000010f, APInt(16, 0x0f01).byteSwap()); + EXPECT_EQ(0x117700ff, APInt(32, 0xff007711).byteSwap()); + EXPECT_EQ(0x050403020100ULL, APInt(48, 0x000102030405ULL).byteSwap()); + EXPECT_EQ(0xff050403020100aaULL, APInt(64, 0xaa000102030405ffULL).byteSwap()); +} + TEST(APIntTest, reverseBits) { EXPECT_EQ(1, APInt(1, 1).reverseBits()); EXPECT_EQ(0, APInt(1, 0).reverseBits()); diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 5d3830972a626..c933388361411 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallBitVector.h" #include "gtest/gtest.h" @@ -1149,4 +1150,40 @@ TYPED_TEST(BitVectorTest, PushBack) { EXPECT_EQ(213U, Vec.size()); EXPECT_EQ(102U, Vec.count()); } + +TYPED_TEST(BitVectorTest, DenseSet) { + DenseSet Set; + TypeParam A(10, true); + auto I = Set.insert(A); + EXPECT_EQ(true, I.second); + + TypeParam B(5, true); + I = Set.insert(B); + EXPECT_EQ(true, I.second); + + TypeParam C(20, false); + C.set(19); + I = Set.insert(C); + EXPECT_EQ(true, I.second); + +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + TypeParam D; + EXPECT_DEATH(Set.insert(D), + "Empty/Tombstone value shouldn't be inserted into map!"); +#endif + + EXPECT_EQ(3U, Set.size()); + EXPECT_EQ(1U, Set.count(A)); + EXPECT_EQ(1U, Set.count(B)); + EXPECT_EQ(1U, Set.count(C)); + + EXPECT_EQ(true, Set.erase(B)); + EXPECT_EQ(2U, Set.size()); + + 
EXPECT_EQ(true, Set.erase(C)); + EXPECT_EQ(1U, Set.size()); + + EXPECT_EQ(true, Set.erase(A)); + EXPECT_EQ(0U, Set.size()); } +} // namespace diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp index 0552b5ad6f7ba..a622c84c1f4d6 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp @@ -670,7 +670,7 @@ TEST_F(DebugLineBasicFixture, ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); // Show that the set address opcode is ignored in this case. - EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0); + EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0u); } TEST_F(DebugLineBasicFixture, ErrorForAddressSizeGreaterThanByteSize) { @@ -731,7 +731,7 @@ TEST_F(DebugLineBasicFixture, ErrorForUnsupportedAddressSizeDefinedInHeader) { ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); // Show that the set address opcode is ignored in this case. 
- EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0); + EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0u); } TEST_F(DebugLineBasicFixture, CallbackUsedForUnterminatedSequence) { diff --git a/llvm/unittests/Frontend/OpenMPContextTest.cpp b/llvm/unittests/Frontend/OpenMPContextTest.cpp index 8741b825cb61e..eb505be042cb3 100644 --- a/llvm/unittests/Frontend/OpenMPContextTest.cpp +++ b/llvm/unittests/Frontend/OpenMPContextTest.cpp @@ -38,12 +38,11 @@ TEST_F(OpenMPContextTest, RoundTripAndAssociation) { #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ EXPECT_EQ(TraitProperty::Enum, \ getOpenMPContextTraitPropertyKind( \ - TraitSet::TraitSetEnum, TraitSelector::TraitSelectorEnum, \ + TraitSet::TraitSetEnum, \ getOpenMPContextTraitPropertyName(TraitProperty::Enum))); \ - EXPECT_EQ( \ - Str, \ - getOpenMPContextTraitPropertyName(getOpenMPContextTraitPropertyKind( \ - TraitSet::TraitSetEnum, TraitSelector::TraitSelectorEnum, Str))); \ + EXPECT_EQ(Str, getOpenMPContextTraitPropertyName( \ + getOpenMPContextTraitPropertyKind(TraitSet::TraitSetEnum, \ + Str))); \ EXPECT_EQ(TraitSet::TraitSetEnum, \ getOpenMPContextTraitSetForProperty(TraitProperty::Enum)); \ EXPECT_EQ(TraitSelector::TraitSelectorEnum, \ diff --git a/llvm/unittests/Support/Host.cpp b/llvm/unittests/Support/Host.cpp index 2c17a5094101e..62252347d62aa 100644 --- a/llvm/unittests/Support/Host.cpp +++ b/llvm/unittests/Support/Host.cpp @@ -37,7 +37,8 @@ class HostTest : public testing::Test { // Initially this is only testing detection of the number of // physical cores, which is currently only supported/tested for // x86_64 Linux and Darwin. 
- return (Host.getArch() == Triple::x86_64 && + return Host.isOSWindows() || + (Host.getArch() == Triple::x86_64 && (Host.isOSDarwin() || Host.getOS() == Triple::Linux)); } diff --git a/llvm/unittests/Support/TaskQueueTest.cpp b/llvm/unittests/Support/TaskQueueTest.cpp index 0a8aeca4e2d6f..4d8c3e4064b49 100644 --- a/llvm/unittests/Support/TaskQueueTest.cpp +++ b/llvm/unittests/Support/TaskQueueTest.cpp @@ -22,7 +22,7 @@ class TaskQueueTest : public testing::Test { }; TEST_F(TaskQueueTest, OrderedFutures) { - ThreadPool TP(1); + ThreadPool TP(hardware_concurrency(1)); TaskQueue TQ(TP); std::atomic X{ 0 }; std::atomic Y{ 0 }; @@ -66,7 +66,7 @@ TEST_F(TaskQueueTest, OrderedFutures) { } TEST_F(TaskQueueTest, UnOrderedFutures) { - ThreadPool TP(1); + ThreadPool TP(hardware_concurrency(1)); TaskQueue TQ(TP); std::atomic X{ 0 }; std::atomic Y{ 0 }; @@ -96,7 +96,7 @@ TEST_F(TaskQueueTest, UnOrderedFutures) { } TEST_F(TaskQueueTest, FutureWithReturnValue) { - ThreadPool TP(1); + ThreadPool TP(hardware_concurrency(1)); TaskQueue TQ(TP); std::future F1 = TQ.async([&] { return std::string("Hello"); }); std::future F2 = TQ.async([&] { return 42; }); diff --git a/llvm/unittests/Support/ThreadPool.cpp b/llvm/unittests/Support/ThreadPool.cpp index a16adbbb78a75..237be875909b7 100644 --- a/llvm/unittests/Support/ThreadPool.cpp +++ b/llvm/unittests/Support/ThreadPool.cpp @@ -8,11 +8,13 @@ #include "llvm/Support/ThreadPool.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Threading.h" #include "gtest/gtest.h" @@ -69,6 +71,8 @@ class ThreadPoolTest : public testing::Test { void SetUp() override { MainThreadReady = false; } + void TestAllThreads(ThreadPoolStrategy S); + std::condition_variable WaitMainThread; std::mutex WaitMainThreadMutex; bool MainThreadReady = false; @@ -131,7 +135,7 @@ TEST_F(ThreadPoolTest, 
Async) { TEST_F(ThreadPoolTest, GetFuture) { CHECK_UNSUPPORTED(); - ThreadPool Pool{2}; + ThreadPool Pool(hardware_concurrency(2)); std::atomic_int i{0}; Pool.async([this, &i] { waitForMainThread(); @@ -162,3 +166,45 @@ TEST_F(ThreadPoolTest, PoolDestruction) { } ASSERT_EQ(5, checked_in); } + +#if LLVM_ENABLE_THREADS == 1 + +void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) { + // FIXME: Skip these tests on non-Windows because multi-socket system were not + // tested on Unix yet, and llvm::get_thread_affinity_mask() isn't implemented + // for Unix. + Triple Host(Triple::normalize(sys::getProcessTriple())); + if (!Host.isOSWindows()) + return; + + llvm::DenseSet ThreadsUsed; + std::mutex Lock; + unsigned Threads = 0; + { + ThreadPool Pool(S); + Threads = Pool.getThreadCount(); + for (size_t I = 0; I < 10000; ++I) { + Pool.async([&] { + waitForMainThread(); + std::lock_guard Guard(Lock); + auto Mask = llvm::get_thread_affinity_mask(); + ThreadsUsed.insert(Mask); + }); + } + ASSERT_EQ(true, ThreadsUsed.empty()); + setMainThreadReady(); + } + ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size()); +} + +TEST_F(ThreadPoolTest, AllThreads_UseAllRessources) { + CHECK_UNSUPPORTED(); + TestAllThreads({}); +} + +TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) { + CHECK_UNSUPPORTED(); + TestAllThreads(llvm::heavyweight_hardware_concurrency()); +} + +#endif diff --git a/llvm/unittests/Support/Threading.cpp b/llvm/unittests/Support/Threading.cpp index 183c9aa7d71c4..c76e6e4a5bd17 100644 --- a/llvm/unittests/Support/Threading.cpp +++ b/llvm/unittests/Support/Threading.cpp @@ -21,7 +21,8 @@ TEST(Threading, PhysicalConcurrency) { auto Num = heavyweight_hardware_concurrency(); // Since Num is unsigned this will also catch us trying to // return -1. 
- ASSERT_LE(Num, thread::hardware_concurrency()); + ASSERT_LE(Num.compute_thread_count(), + hardware_concurrency().compute_thread_count()); } #if LLVM_ENABLE_THREADS diff --git a/llvm/utils/gn/build/BUILD.gn b/llvm/utils/gn/build/BUILD.gn index 7947a367cf288..b74268996f150 100644 --- a/llvm/utils/gn/build/BUILD.gn +++ b/llvm/utils/gn/build/BUILD.gn @@ -59,8 +59,8 @@ config("compiler_defaults") { } if (is_optimized) { cflags += [ - # FIXME: evaluate /Gw (not part of /O2) "/O2", + "/Gw", "/Zc:inline", ] ldflags += [ diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py index 21cc5b4e5e302..9b236dbd24319 100755 --- a/llvm/utils/update_cc_test_checks.py +++ b/llvm/utils/update_cc_test_checks.py @@ -292,10 +292,10 @@ def main(): False, args.function_signature) output_lines.append(line.rstrip('\n')) - # Update the test file. - with open(filename, 'w') as f: - for line in output_lines: - f.write(line + '\n') + + common.debug('Writing %d lines to %s...' % (len(output_lines), filename)) + with open(filename, 'wb') as f: + f.writelines(['{}\n'.format(l).encode('utf-8') for l in output_lines]) return 0 diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h index 53f2fea1ccfc6..cf6316cae643a 100644 --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -171,9 +171,11 @@ struct AffineCopyOptions { /// by its root affine.for. Since we generate alloc's and dealloc's for all fast /// buffers (before and after the range of operations resp. or at a hoisted /// position), all of the fast memory capacity is assumed to be available for -/// processing this block range. +/// processing this block range. When 'filterMemRef' is specified, copies are +/// only generated for the provided MemRef. 
uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, + Optional filterMemRef, DenseSet ©Nests); /// Tile a nest of standard for loops rooted at `rootForOp` by finding such @@ -220,6 +222,11 @@ void coalesceLoops(MutableArrayRef loops); /// ``` void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef processorId, ArrayRef numProcessors); + +/// Gathers all AffineForOps in 'func' grouped by loop depth. +void gatherLoops(FuncOp func, + DenseMap> &depthToLoops); + } // end namespace mlir #endif // MLIR_TRANSFORMS_LOOP_UTILS_H diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp index fc40ff18ce06f..41adb623e74d6 100644 --- a/mlir/lib/Pass/Pass.cpp +++ b/mlir/lib/Pass/Pass.cpp @@ -411,7 +411,8 @@ void OpToOpPassAdaptorParallel::runOnOperation() { // Create the async executors if they haven't been created, or if the main // pipeline has changed. if (asyncExecutors.empty() || hasSizeMismatch(asyncExecutors.front(), mgrs)) - asyncExecutors.assign(llvm::hardware_concurrency(), mgrs); + asyncExecutors.assign(llvm::hardware_concurrency().compute_thread_count(), + mgrs); // Run a prepass over the module to collect the operations to execute over. // This ensures that an analysis manager exists for each operation, as well as diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp index 449dcfafeceb0..5409c557da83e 100644 --- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp @@ -179,7 +179,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block, if ((forOp = dyn_cast(&*it)) && copyNests.count(forOp) == 0) { // Perform the copying up unti this 'for' op first. affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, - copyNests); + /*filterMemRef=*/llvm::None, copyNests); // Returns true if the footprint is known to exceed capacity. 
auto exceedsCapacity = [&](AffineForOp forOp) { @@ -213,7 +213,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block, // consumed capacity. The footprint check above guarantees this inner // loop's footprint fits. affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions, - copyNests); + /*filterMemRef=*/llvm::None, copyNests); } // Get to the next load or store op after 'forOp'. curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) { @@ -236,7 +236,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block, assert(!curBegin->isKnownTerminator() && "can't be a terminator"); // Exclude the affine terminator - hence, the std::prev. affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()), - copyOptions, copyNests); + copyOptions, /*filterMemRef=*/llvm::None, copyNests); } return success(); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 56f954f214225..da3d819cbc3ed 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -1585,16 +1585,21 @@ static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs, return true; } -/// Generates copies for a contiguous sequence of operations in `block` in the -/// iterator range [`begin', `end'), where `end' can't be past the terminator of -/// the block (since additional operations are potentially inserted right before -/// `end'. Returns the total size of the fast buffers used. -// Since we generate alloc's and dealloc's for all fast buffers (before and -// after the range of operations resp.), all of the fast memory capacity is -// assumed to be available for processing this block range. +/// Performs explicit copying for the contiguous sequence of operations in the +/// block iterator range [`begin', `end'), where `end' can't be past the +/// terminator of the block (since additional operations are potentially +/// inserted right before `end`).
Returns the total size of fast memory space +/// buffers used. `copyOptions` provides various parameters, and the output +/// argument `copyNests` is the set of all copy nests inserted, each represented +/// by its root affine.for. Since we generate alloc's and dealloc's for all fast +/// buffers (before and after the range of operations resp. or at a hoisted +/// position), all of the fast memory capacity is assumed to be available for +/// processing this block range. When 'filterMemRef' is specified, copies are +/// only generated for the provided MemRef. uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, + Optional filterMemRef, DenseSet ©Nests) { if (begin == end) return 0; @@ -1631,12 +1636,14 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, block->walk(begin, end, [&](Operation *opInst) { // Gather regions to allocate to buffers in faster memory space. if (auto loadOp = dyn_cast(opInst)) { - if ((loadOp.getMemRefType().getMemorySpace() != + if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) || + (loadOp.getMemRefType().getMemorySpace() != copyOptions.slowMemorySpace)) return; } else if (auto storeOp = dyn_cast(opInst)) { - if (storeOp.getMemRefType().getMemorySpace() != - copyOptions.slowMemorySpace) + if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) || + storeOp.getMemRefType().getMemorySpace() != + copyOptions.slowMemorySpace) return; } else { // Neither load nor a store op. @@ -1776,3 +1783,24 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, return totalCopyBuffersSizeInBytes; } + +/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. 
+static void gatherLoopsInBlock( + Block *block, unsigned currLoopDepth, + DenseMap> &depthToLoops) { + auto &loopsAtDepth = depthToLoops[currLoopDepth]; + for (auto &op : *block) { + if (auto forOp = dyn_cast(op)) { + loopsAtDepth.push_back(forOp); + gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops); + } + } +} + +/// Gathers all AffineForOps in 'func' grouped by loop depth. +void mlir::gatherLoops( + FuncOp func, + DenseMap> &depthToLoops) { + for (auto &block : func) + gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops); +} diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir index c83beb183021b..b2e4fbbf76c19 100644 --- a/mlir/test/Transforms/affine-data-copy.mlir +++ b/mlir/test/Transforms/affine-data-copy.mlir @@ -2,6 +2,12 @@ // Small buffer size to trigger fine copies. // RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s +// Test affine data copy with a memref filter. We use a test pass that invokes +// affine data copy utility on the input loop nest. +// '-test-affine-data-copy-memref-filter' passes the first memref found in an +// affine.load op in the innermost loop as a filter. 
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER + // -copy-skip-non-stride-loops forces the copies to be placed right inside the // tile space loops, avoiding the sensitivity of copy placement depth to memory // footprint -- so that one could write a definite test case and not have to @@ -16,6 +22,7 @@ // CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)> // CHECK-LABEL: func @matmul +// FILTER-LABEL: func @matmul func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> { affine.for %i = 0 to 4096 step 128 { affine.for %j = 0 to 4096 step 128 { @@ -110,11 +117,29 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40 // CHECK: } // CHECK: } +// Check that only one memref is copied when memref filter is used. + +// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER: alloc() : memref<128x4096xf32> +// FILTER-NOT: alloc() +// FILTER: affine.for %{{.*}} = 0 to 128 { +// FILTER: affine.for %{{.*}} = 0 to 4096 { +// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER: dealloc %1 : memref<128x4096xf32> +// FILTER-NOT: dealloc %1 : memref<128x4096xf32> + +// ----- + // // This test case will lead to single element buffers. These are eventually // expected to be turned into registers via alloca and mem2reg. 
// -// CHECK-SMALL: func @foo +// CHECK-SMALL-LABEL: func @foo +// FILTER-LABEL: func @foo func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { affine.for %i = 0 to 1024 { affine.for %j = 0 to 1024 { @@ -161,3 +186,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem // CHECK-SMALL: } // CHECK-SMALL: } // CHECK-SMALL: return + +// Check that only one memref is copied when memref filter is used. + +// FILTER: alloc() : memref<1024x1024xf32> +// FILTER-NOT: alloc() +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { +// FILTER: dealloc %{{.*}} : memref<1024x1024xf32> +// FILTER-NOT: dealloc diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir index 9724f990f97ca..b1e71e694690b 100644 --- a/mlir/test/Transforms/dma-generate.mlir +++ b/mlir/test/Transforms/dma-generate.mlir @@ -543,7 +543,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, // CHECK: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> // CHECK: affine.for %{{.*}} = -// ---- +// ----- #map3 = affine_map<(d0) -> (d0)> #map12 = affine_map<(d0) -> (d0 + 3)> @@ -551,6 +551,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, #map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)> #map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)> // Test for test case in b/128303048 #4. 
+// CHECK-LABEL: func @test_memref_bounds func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) { %c0 = constant 0 : index affine.for %i8 = 0 to 9 step 3 { diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index 47a0dd92cd062..8c422e718f1fd 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(MLIRTestTransforms + TestAffineDataCopy.cpp TestAllReduceLowering.cpp TestCallGraph.cpp TestConstantFold.cpp diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp new file mode 100644 index 0000000000000..e03d45cb9dd45 --- /dev/null +++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp @@ -0,0 +1,86 @@ +//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to test affine data copy utility functions and +// options. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Passes.h" +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +#define PASS_NAME "test-affine-data-copy" + +using namespace mlir; + +static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options"); + +namespace { + +struct TestAffineDataCopy : public FunctionPass { + TestAffineDataCopy() = default; + TestAffineDataCopy(const TestAffineDataCopy &pass){}; + + void runOnFunction() override; + +private: + Option clMemRefFilter{ + *this, "memref-filter", + llvm::cl::desc( + "Enable memref filter testing in affine data copy optimization"), + llvm::cl::init(false)}; +}; + +} // end anonymous namespace + +void TestAffineDataCopy::runOnFunction() { + // Gather all AffineForOps by loop depth. + DenseMap> depthToLoops; + gatherLoops(getFunction(), depthToLoops); + assert(depthToLoops.size() && "Loop nest not found"); + + // Only support tests with a single loop nest and a single innermost loop + // for now. + unsigned innermostLoopIdx = depthToLoops.size() - 2; + if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1) + return; + + auto loopNest = depthToLoops[0][0]; + auto innermostLoop = depthToLoops[innermostLoopIdx][0]; + Optional memrefFilter; + if (clMemRefFilter) { + // Gather MemRef filter. For simplicity, we use the first loaded memref + // found in the innermost loop. 
+ for (auto &op : *innermostLoop.getBody()) { + if (auto load = dyn_cast(op)) { + memrefFilter = load.getMemRef(); + break; + } + } + } + + AffineCopyOptions copyOptions = {/*generateDma=*/false, + /*slowMemorySpace=*/0, + /*fastMemorySpace=*/0, + /*tagMemorySpace=*/0, + /*fastMemCapacityBytes=*/32 * 1024 * 1024UL}; + DenseSet copyNests; + affineDataCopyGenerate(loopNest.getBody()->begin(), + std::prev(loopNest.getBody()->end()), copyOptions, + memrefFilter, copyNests); +} + +namespace mlir { +void registerTestAffineDataCopyPass() { + PassRegistration( + PASS_NAME, "Tests affine data copy utility functions."); +} +} // namespace mlir diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp index 9ffa347173f65..9214fa9fc4333 100644 --- a/mlir/test/lib/Transforms/TestLoopFusion.cpp +++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopFusionUtils.h" +#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/STLExtras.h" @@ -54,19 +55,6 @@ struct TestLoopFusion : public FunctionPass { } // end anonymous namespace -// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. -static void -gatherLoops(Block *block, unsigned currLoopDepth, - DenseMap> &depthToLoops) { - auto &loopsAtDepth = depthToLoops[currLoopDepth]; - for (auto &op : *block) { - if (auto forOp = dyn_cast(op)) { - loopsAtDepth.push_back(forOp); - gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops); - } - } -} - // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths // in range ['loopDepth' + 1, 'maxLoopDepth']. // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists. @@ -194,8 +182,7 @@ void TestLoopFusion::runOnFunction() { do { depthToLoops.clear(); // Gather all AffineForOps by loop depth. 
- for (auto &block : getFunction()) - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + gatherLoops(getFunction(), depthToLoops); // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'. } while (iterateLoops(depthToLoops, testLoopFusionTransformation, @@ -204,8 +191,7 @@ void TestLoopFusion::runOnFunction() { } // Gather all AffineForOps by loop depth. - for (Block &block : getFunction()) - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + gatherLoops(getFunction(), depthToLoops); // Run tests on all combinations of src/dst loop nests in 'depthToLoops'. if (clTestDependenceCheck) diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index bf6b57c2b6247..4df330e77bcd8 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -34,6 +34,7 @@ void registerPassManagerTestPass(); void registerPatternsTestPass(); void registerSimpleParametricTilingPass(); void registerSymbolTestPasses(); +void registerTestAffineDataCopyPass(); void registerTestAllReduceLoweringPass(); void registerTestCallGraphPass(); void registerTestConstantFold(); @@ -85,6 +86,7 @@ void registerTestPasses() { registerPatternsTestPass(); registerSimpleParametricTilingPass(); registerSymbolTestPasses(); + registerTestAffineDataCopyPass(); registerTestAllReduceLoweringPass(); registerTestCallGraphPass(); registerTestConstantFold();